diff --git a/docs/src/main/sphinx/connector/iceberg.rst b/docs/src/main/sphinx/connector/iceberg.rst index e51daae6460b..b6105b55e90b 100644 --- a/docs/src/main/sphinx/connector/iceberg.rst +++ b/docs/src/main/sphinx/connector/iceberg.rst @@ -775,9 +775,9 @@ You can retrieve the information about the manifests of the Iceberg table .. code-block:: text - path | length | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count | partitions - ----------------------------------------------------------------------------------------------------------------+-----------------+----------------------+-----------------------+--------------------------+-----------------------------+-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------- - hdfs://hadoop-master:9000/user/hive/warehouse/test_table/metadata/faa19903-1455-4bb8-855a-61a1bbafbaa7-m0.avro | 6277 | 0 | 7860805980949777961 | 1 | 0 | 0 |{{contains_null=false, contains_nan= false, lower_bound=1, upper_bound=1},{contains_null=false, contains_nan= false, lower_bound=2021-01-12, upper_bound=2021-01-12}} + path | length | partition_spec_id | added_snapshot_id | added_data_files_count | added_rows_count | existing_data_files_count | existing_rows_count | deleted_data_files_count | deleted_rows_count | partitions + ----------------------------------------------------------------------------------------------------------------+-----------------+----------------------+-----------------------+-------------------------+------------------+-----------------------------+---------------------+-----------------------------+--------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------- + 
hdfs://hadoop-master:9000/user/hive/warehouse/test_table/metadata/faa19903-1455-4bb8-855a-61a1bbafbaa7-m0.avro | 6277 | 0 | 7860805980949777961 | 1 | 100 | 0 | 0 | 0 | 0 | {{contains_null=false, contains_nan=false, lower_bound=1, upper_bound=1},{contains_null=false, contains_nan=false, lower_bound=2021-01-12, upper_bound=2021-01-12}} The output of the query has the following columns: @@ -804,12 +804,21 @@ The output of the query has the following columns: * - ``added_data_files_count`` - ``integer`` - The number of data files with status ``ADDED`` in the manifest file + * - ``added_rows_count`` + - ``bigint`` + - The total number of rows in all data files with status ``ADDED`` in the manifest file. * - ``existing_data_files_count`` - ``integer`` - The number of data files with status ``EXISTING`` in the manifest file + * - ``existing_rows_count`` + - ``bigint`` + - The total number of rows in all data files with status ``EXISTING`` in the manifest file. * - ``deleted_data_files_count`` - ``integer`` - The number of data files with status ``DELETED`` in the manifest file + * - ``deleted_rows_count`` + - ``bigint`` + - The total number of rows in all data files with status ``DELETED`` in the manifest file. 
* - ``partitions`` - ``array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))`` - Partition range metadata diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ManifestsTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ManifestsTable.java index ca439ecdba9f..896b5a4feb08 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ManifestsTable.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ManifestsTable.java @@ -68,8 +68,11 @@ public ManifestsTable(SchemaTableName tableName, Table icebergTable, Optional buildPages(ConnectorTableMetadata tableMetadata, Table pagesBuilder.appendInteger(file.partitionSpecId()); pagesBuilder.appendBigint(file.snapshotId()); pagesBuilder.appendInteger(file.addedFilesCount()); + pagesBuilder.appendBigint(file.addedRowsCount()); pagesBuilder.appendInteger(file.existingFilesCount()); + pagesBuilder.appendBigint(file.existingRowsCount()); pagesBuilder.appendInteger(file.deletedFilesCount()); + pagesBuilder.appendBigint(file.deletedRowsCount()); writePartitionSummaries(pagesBuilder.nextColumn(), file.partitions(), partitionSpecsById.get(file.partitionSpecId())); pagesBuilder.endRow(); }); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSystemTables.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSystemTables.java index 7467262f9b48..17767bd160ee 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSystemTables.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSystemTables.java @@ -64,6 +64,19 @@ public void setUp() assertUpdate("INSERT INTO test_schema.test_table_nan VALUES (1, 1.1, 1.2, CAST('2022-01-01' AS DATE)), (2, nan(), 2.2, CAST('2022-01-02' AS DATE)), (3, 3.3, nan(), CAST('2022-01-03' AS DATE))", 3); assertUpdate("INSERT INTO test_schema.test_table_nan VALUES (4, nan(), 4.1, 
CAST('2022-01-04' AS DATE)), (5, 4.2, nan(), CAST('2022-01-04' AS DATE)), (6, nan(), nan(), CAST('2022-01-04' AS DATE))", 3); assertQuery("SELECT count(*) FROM test_schema.test_table_nan", "VALUES 6"); + + assertUpdate("CREATE TABLE test_schema.test_table_with_dml (_varchar VARCHAR, _date DATE) WITH (partitioning = ARRAY['_date'])"); + assertUpdate( + "INSERT INTO test_schema.test_table_with_dml " + + "VALUES " + + "('a1', DATE '2022-01-01'), ('a2', DATE '2022-01-01'), " + + "('b1', DATE '2022-02-02'), ('b2', DATE '2022-02-02'), " + + "('c1', DATE '2022-03-03'), ('c2', DATE '2022-03-03')", + 6); + assertUpdate("UPDATE test_schema.test_table_with_dml SET _varchar = 'a1.updated' WHERE _date = DATE '2022-01-01' AND _varchar = 'a1'", 1); + assertUpdate("DELETE FROM test_schema.test_table_with_dml WHERE _date = DATE '2022-02-02' AND _varchar = 'b2'", 1); + assertUpdate("INSERT INTO test_schema.test_table_with_dml VALUES ('c3', DATE '2022-03-03'), ('d1', DATE '2022-04-04')", 2); + assertQuery("SELECT count(*) FROM test_schema.test_table_with_dml", "VALUES 7"); } @AfterClass(alwaysRun = true) @@ -73,6 +86,7 @@ public void tearDown() assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_multilevel_partitions"); assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_drop_column"); assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_nan"); + assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_with_dml"); assertUpdate("DROP SCHEMA IF EXISTS test_schema"); } @@ -202,19 +216,38 @@ public void testManifestsTable() "('partition_spec_id', 'integer', '', '')," + "('added_snapshot_id', 'bigint', '', '')," + "('added_data_files_count', 'integer', '', '')," + + "('added_rows_count', 'bigint', '', '')," + "('existing_data_files_count', 'integer', '', '')," + + "('existing_rows_count', 'bigint', '', '')," + "('deleted_data_files_count', 'integer', '', '')," + + "('deleted_rows_count', 'bigint', '', '')," + "('partitions', 'array(row(contains_null boolean, contains_nan 
boolean, lower_bound varchar, upper_bound varchar))', '', '')"); assertQuerySucceeds("SELECT * FROM test_schema.\"test_table$manifests\""); - assertThat(query("SELECT partitions FROM test_schema.\"test_table$manifests\"")) - .matches("VALUES " + - " CAST(ARRAY[ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))) , " + - " CAST(ARRAY[ROW(false, false, '2019-09-09', '2019-09-10')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))"); + assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table$manifests\"")) + .matches( + "VALUES " + + " (2, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))) , " + + " (2, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2019-09-09', '2019-09-10')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))))"); assertQuerySucceeds("SELECT * FROM test_schema.\"test_table_multilevel_partitions$manifests\""); - assertThat(query("SELECT partitions FROM test_schema.\"test_table_multilevel_partitions$manifests\"")) - .matches("VALUES " + - " CAST(ARRAY[ROW(false, false, '0', '1'), ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))"); + assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table_multilevel_partitions$manifests\"")) + .matches( + "VALUES " + + "(3, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '0', '1'), ROW(false, false, 
'2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))))"); + + assertQuerySucceeds("SELECT * FROM test_schema.\"test_table_with_dml$manifests\""); + assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table_with_dml$manifests\"")) + .matches( + "VALUES " + + // INSERT on '2022-01-01', '2022-02-02', '2022-03-03' partitions + "(3, BIGINT '0', BIGINT '6', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-01-01', '2022-03-03')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))), " + + // UPDATE on '2022-01-01' partition + "(1, BIGINT '0', BIGINT '1', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-01-01', '2022-01-01')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))), " + + "(1, BIGINT '0', BIGINT '1', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-01-01', '2022-01-01')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))), " + + // DELETE from '2022-02-02' partition + "(1, BIGINT '0', BIGINT '1', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-02-02', '2022-02-02')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))), " + + // INSERT on '2022-03-03', '2022-04-04' partitions + "(2, BIGINT '0', BIGINT '2', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-03-03', '2022-04-04')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))))"); } @Test