Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions docs/src/main/sphinx/connector/iceberg.rst
Original file line number Diff line number Diff line change
Expand Up @@ -775,9 +775,9 @@ You can retrieve the information about the manifests of the Iceberg table

.. code-block:: text

path | length | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count | partitions
----------------------------------------------------------------------------------------------------------------+-----------------+----------------------+-----------------------+--------------------------+-----------------------------+-----------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------
hdfs://hadoop-master:9000/user/hive/warehouse/test_table/metadata/faa19903-1455-4bb8-855a-61a1bbafbaa7-m0.avro | 6277 | 0 | 7860805980949777961 | 1 | 0 | 0 |{{contains_null=false, contains_nan= false, lower_bound=1, upper_bound=1},{contains_null=false, contains_nan= false, lower_bound=2021-01-12, upper_bound=2021-01-12}}
path | length | partition_spec_id | added_snapshot_id | added_data_files_count | added_rows_count | existing_data_files_count | existing_rows_count | deleted_data_files_count | deleted_rows_count | partitions
----------------------------------------------------------------------------------------------------------------+-----------------+----------------------+-----------------------+-------------------------+------------------+-----------------------------+---------------------+-----------------------------+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------
hdfs://hadoop-master:9000/user/hive/warehouse/test_table/metadata/faa19903-1455-4bb8-855a-61a1bbafbaa7-m0.avro | 6277 | 0 | 7860805980949777961 | 1 | 100 | 0 | 0 | 0 | 0 | {{contains_null=false, contains_nan= false, lower_bound=1, upper_bound=1},{contains_null=false, contains_nan= false, lower_bound=2021-01-12, upper_bound=2021-01-12}}


The output of the query has the following columns:
Expand All @@ -804,12 +804,21 @@ The output of the query has the following columns:
* - ``added_data_files_count``
- ``integer``
- The number of data files with status ``ADDED`` in the manifest file.
* - ``added_rows_count``
- ``bigint``
- The total number of rows in all data files with status ``ADDED`` in the manifest file.
* - ``existing_data_files_count``
- ``integer``
- The number of data files with status ``EXISTING`` in the manifest file.
* - ``existing_rows_count``
- ``bigint``
- The total number of rows in all data files with status ``EXISTING`` in the manifest file.
* - ``deleted_data_files_count``
- ``integer``
- The number of data files with status ``DELETED`` in the manifest file.
* - ``deleted_rows_count``
- ``bigint``
- The total number of rows in all data files with status ``DELETED`` in the manifest file.
* - ``partitions``
- ``array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))``
- Partition range metadata
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,11 @@ public ManifestsTable(SchemaTableName tableName, Table icebergTable, Optional<Lo
.add(new ColumnMetadata("partition_spec_id", INTEGER))
.add(new ColumnMetadata("added_snapshot_id", BIGINT))
.add(new ColumnMetadata("added_data_files_count", INTEGER))
.add(new ColumnMetadata("added_rows_count", BIGINT))
.add(new ColumnMetadata("existing_data_files_count", INTEGER))
.add(new ColumnMetadata("existing_rows_count", BIGINT))
.add(new ColumnMetadata("deleted_data_files_count", INTEGER))
.add(new ColumnMetadata("deleted_rows_count", BIGINT))
.add(new ColumnMetadata("partitions", new ArrayType(RowType.rowType(
RowType.field("contains_null", BOOLEAN),
RowType.field("contains_nan", BOOLEAN),
Expand Down Expand Up @@ -118,8 +121,11 @@ private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, Table
pagesBuilder.appendInteger(file.partitionSpecId());
pagesBuilder.appendBigint(file.snapshotId());
pagesBuilder.appendInteger(file.addedFilesCount());
pagesBuilder.appendBigint(file.addedRowsCount());
pagesBuilder.appendInteger(file.existingFilesCount());
pagesBuilder.appendBigint(file.existingRowsCount());
pagesBuilder.appendInteger(file.deletedFilesCount());
pagesBuilder.appendBigint(file.deletedRowsCount());
writePartitionSummaries(pagesBuilder.nextColumn(), file.partitions(), partitionSpecsById.get(file.partitionSpecId()));
pagesBuilder.endRow();
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ public void setUp()
assertUpdate("INSERT INTO test_schema.test_table_nan VALUES (1, 1.1, 1.2, CAST('2022-01-01' AS DATE)), (2, nan(), 2.2, CAST('2022-01-02' AS DATE)), (3, 3.3, nan(), CAST('2022-01-03' AS DATE))", 3);
assertUpdate("INSERT INTO test_schema.test_table_nan VALUES (4, nan(), 4.1, CAST('2022-01-04' AS DATE)), (5, 4.2, nan(), CAST('2022-01-04' AS DATE)), (6, nan(), nan(), CAST('2022-01-04' AS DATE))", 3);
assertQuery("SELECT count(*) FROM test_schema.test_table_nan", "VALUES 6");

assertUpdate("CREATE TABLE test_schema.test_table_with_dml (_varchar VARCHAR, _date DATE) WITH (partitioning = ARRAY['_date'])");
assertUpdate(
"INSERT INTO test_schema.test_table_with_dml " +
"VALUES " +
"('a1', DATE '2022-01-01'), ('a2', DATE '2022-01-01'), " +
"('b1', DATE '2022-02-02'), ('b2', DATE '2022-02-02'), " +
"('c1', DATE '2022-03-03'), ('c2', DATE '2022-03-03')",
6);
assertUpdate("UPDATE test_schema.test_table_with_dml SET _varchar = 'a1.updated' WHERE _date = DATE '2022-01-01' AND _varchar = 'a1'", 1);
assertUpdate("DELETE FROM test_schema.test_table_with_dml WHERE _date = DATE '2022-02-02' AND _varchar = 'b2'", 1);
assertUpdate("INSERT INTO test_schema.test_table_with_dml VALUES ('c3', DATE '2022-03-03'), ('d1', DATE '2022-04-04')", 2);
assertQuery("SELECT count(*) FROM test_schema.test_table_with_dml", "VALUES 7");
Comment thread
ebyhr marked this conversation as resolved.
Outdated
}

@AfterClass(alwaysRun = true)
Expand All @@ -73,6 +86,7 @@ public void tearDown()
assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_multilevel_partitions");
assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_drop_column");
assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_nan");
assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_with_dml");
assertUpdate("DROP SCHEMA IF EXISTS test_schema");
}

Expand Down Expand Up @@ -202,19 +216,38 @@ public void testManifestsTable()
"('partition_spec_id', 'integer', '', '')," +
"('added_snapshot_id', 'bigint', '', '')," +
"('added_data_files_count', 'integer', '', '')," +
"('added_rows_count', 'bigint', '', '')," +
"('existing_data_files_count', 'integer', '', '')," +
"('existing_rows_count', 'bigint', '', '')," +
"('deleted_data_files_count', 'integer', '', '')," +
"('deleted_rows_count', 'bigint', '', '')," +
"('partitions', 'array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))', '', '')");
assertQuerySucceeds("SELECT * FROM test_schema.\"test_table$manifests\"");
assertThat(query("SELECT partitions FROM test_schema.\"test_table$manifests\""))
.matches("VALUES " +
" CAST(ARRAY[ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))) , " +
" CAST(ARRAY[ROW(false, false, '2019-09-09', '2019-09-10')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))");
assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table$manifests\""))
.matches(
"VALUES " +
" (2, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))) , " +
" (2, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2019-09-09', '2019-09-10')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))))");

assertQuerySucceeds("SELECT * FROM test_schema.\"test_table_multilevel_partitions$manifests\"");
assertThat(query("SELECT partitions FROM test_schema.\"test_table_multilevel_partitions$manifests\""))
.matches("VALUES " +
" CAST(ARRAY[ROW(false, false, '0', '1'), ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))");
assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table_multilevel_partitions$manifests\""))
.matches(
"VALUES " +
"(3, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '0', '1'), ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))))");

assertQuerySucceeds("SELECT * FROM test_schema.\"test_table_with_dml$manifests\"");
assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table_with_dml$manifests\""))
.matches(
"VALUES " +
// INSERT on '2022-01-01', '2022-02-02', '2022-03-03' partitions
"(3, BIGINT '0', BIGINT '6', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-01-01', '2022-03-03')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))), " +
// UPDATE on '2022-01-01' partition
"(1, BIGINT '0', BIGINT '1', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-01-01', '2022-01-01')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))), " +
"(1, BIGINT '0', BIGINT '1', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-01-01', '2022-01-01')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))), " +
// DELETE from '2022-02-02' partition
"(1, BIGINT '0', BIGINT '1', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-02-02', '2022-02-02')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))), " +
// INSERT on '2022-03-03', '2022-04-04' partitions
"(2, BIGINT '0', BIGINT '2', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2022-03-03', '2022-04-04')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))))");
}

@Test
Expand Down