diff --git a/docs/src/main/sphinx/release/release-339.md b/docs/src/main/sphinx/release/release-339.md index 1e5944bd7c5d..986241ae66c2 100644 --- a/docs/src/main/sphinx/release/release-339.md +++ b/docs/src/main/sphinx/release/release-339.md @@ -31,7 +31,6 @@ ## Hive connector -* Add support for writing Bloom filters in ORC files. ({issue}`3939`) * Make `location` parameter optional for the `system.register_partition` procedure. ({issue}`4443`) * Avoid creating tiny splits at the end of block boundaries. ({issue}`4485`) * Remove requirement to configure `metastore.storage.schema.reader.impl` in Hive 3.x metastore diff --git a/docs/src/main/sphinx/release/release-365.md b/docs/src/main/sphinx/release/release-365.md index 3cd1b54b1ba6..615181154479 100644 --- a/docs/src/main/sphinx/release/release-365.md +++ b/docs/src/main/sphinx/release/release-365.md @@ -73,7 +73,6 @@ * Fix reporting of number of read bytes for tables using `ORC` file format. ({issue}`10048`) * Account for memory used for deleted row information when reading from ACID tables. ({issue}`9914`, {issue}`10070`) * Fix `REVOKE GRANT OPTION` to revoke only the grant option instead of revoking the entire privilege. ({issue}`10094`) -* Fix issue that prevented writing bloom filters for ORC files. ({issue}`9792`) * Fix bug where incorrect rows were deleted when deleting from a transactional table that has original files (before the first major compaction). ({issue}`10095`) * Fix delete and update failure when changing a table after a major compaction. ({issue}`10120`) * Fix incorrect results when decoding decimal values in Parquet reader. 
({issue}`9971`) diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java index 7854b233900c..7f9d94d5aaa5 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java @@ -317,8 +317,8 @@ public class HiveMetadata public static final String PRESTO_VIEW_COMMENT = "Presto View"; public static final String PRESTO_VIEW_EXPANDED_TEXT_MARKER = "/* Presto View */"; - private static final String ORC_BLOOM_FILTER_COLUMNS_KEY = "orc.bloom.filter.columns"; - private static final String ORC_BLOOM_FILTER_FPP_KEY = "orc.bloom.filter.fpp"; + public static final String ORC_BLOOM_FILTER_COLUMNS_KEY = "orc.bloom.filter.columns"; + public static final String ORC_BLOOM_FILTER_FPP_KEY = "orc.bloom.filter.fpp"; public static final String SKIP_HEADER_COUNT_KEY = serdeConstants.HEADER_COUNT; public static final String SKIP_FOOTER_COUNT_KEY = serdeConstants.FOOTER_COUNT; diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java index dc5b66f89ad6..9ad460298910 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java @@ -123,10 +123,11 @@ import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE; import static io.trino.plugin.hive.HiveErrorCode.HIVE_SERDE_NOT_FOUND; import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT; +import static io.trino.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_COLUMNS_KEY; +import static io.trino.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_FPP_KEY; import static io.trino.plugin.hive.HiveMetadata.SKIP_FOOTER_COUNT_KEY; import static io.trino.plugin.hive.HiveMetadata.SKIP_HEADER_COUNT_KEY; import static 
io.trino.plugin.hive.HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION; -import static io.trino.plugin.hive.HiveTableProperties.ORC_BLOOM_FILTER_COLUMNS; import static io.trino.plugin.hive.HiveTableProperties.ORC_BLOOM_FILTER_FPP; import static io.trino.plugin.hive.HiveType.toHiveTypes; import static io.trino.plugin.hive.metastore.SortingColumn.Order.ASCENDING; @@ -1083,18 +1084,18 @@ public static List getColumnTypes(Properties schema) public static OrcWriterOptions getOrcWriterOptions(Properties schema, OrcWriterOptions orcWriterOptions) { - if (schema.containsKey(ORC_BLOOM_FILTER_COLUMNS)) { - if (!schema.containsKey(ORC_BLOOM_FILTER_FPP)) { - throw new TrinoException(HIVE_INVALID_METADATA, format("FPP for bloom filter is missing")); + if (schema.containsKey(ORC_BLOOM_FILTER_COLUMNS_KEY)) { + if (!schema.containsKey(ORC_BLOOM_FILTER_FPP_KEY)) { + throw new TrinoException(HIVE_INVALID_METADATA, "FPP for bloom filter is missing"); } try { - double fpp = parseDouble(schema.getProperty(ORC_BLOOM_FILTER_FPP)); + double fpp = parseDouble(schema.getProperty(ORC_BLOOM_FILTER_FPP_KEY)); return orcWriterOptions - .withBloomFilterColumns(ImmutableSet.copyOf(COLUMN_NAMES_SPLITTER.splitToList(schema.getProperty(ORC_BLOOM_FILTER_COLUMNS)))) + .withBloomFilterColumns(ImmutableSet.copyOf(COLUMN_NAMES_SPLITTER.splitToList(schema.getProperty(ORC_BLOOM_FILTER_COLUMNS_KEY)))) .withBloomFilterFpp(fpp); } catch (NumberFormatException e) { - throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, format("Invalid value for %s property: %s", ORC_BLOOM_FILTER_FPP, schema.getProperty(ORC_BLOOM_FILTER_FPP))); + throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, format("Invalid value for %s property: %s", ORC_BLOOM_FILTER_FPP, schema.getProperty(ORC_BLOOM_FILTER_FPP_KEY))); } } return orcWriterOptions; diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcWithBloomFilters.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcWithBloomFilters.java new file mode 
100644 index 000000000000..64b0151c860f --- /dev/null +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcWithBloomFilters.java @@ -0,0 +1,108 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hive.orc; + +import io.trino.Session; +import io.trino.plugin.hive.HiveQueryRunner; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.QueryRunner; +import org.intellij.lang.annotations.Language; +import org.testng.annotations.Test; + +import static io.trino.testing.sql.TestTable.randomTableSuffix; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; + +public class TestOrcWithBloomFilters + extends AbstractTestQueryFramework +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return HiveQueryRunner.builder() + .addHiveProperty("hive.orc.bloom-filters.enabled", "true") + .addHiveProperty("hive.orc.default-bloom-filter-fpp", "0.001") + .build(); + } + + @Test + public void testOrcBloomFilterIsWrittenDuringCreate() + { + String tableName = "create_orc_with_bloom_filters_" + randomTableSuffix(); + assertUpdate( + format( + "CREATE TABLE %s WITH (%s) AS SELECT orderstatus, totalprice FROM tpch.tiny.orders", + tableName, + getTableProperties("totalprice", "orderstatus", "totalprice")), + 15000); + + // `totalprice` 51890 is chosen to lie between min/max values of row group + 
assertBloomFilterBasedRowGroupPruning(format("SELECT * FROM %s WHERE totalprice = 51890", tableName)); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testOrcBloomFilterIsWrittenDuringInsert() + { + String tableName = "insert_orc_with_bloom_filters_" + randomTableSuffix(); + assertUpdate( + format( + "CREATE TABLE %s (totalprice DOUBLE, orderstatus VARCHAR) WITH (%s)", + tableName, + getTableProperties("totalprice", "orderstatus", "totalprice"))); + assertUpdate(format("INSERT INTO %s SELECT totalprice, orderstatus FROM tpch.tiny.orders", tableName), 15000); + + // `totalprice` 51890 is chosen to lie between min/max values of row group + assertBloomFilterBasedRowGroupPruning(format("SELECT * FROM %s WHERE totalprice = 51890", tableName)); + assertUpdate("DROP TABLE " + tableName); + } + + private String getTableProperties(String bloomFilterColumnName, String bucketingColumnName, String sortByColumnName) + { + return format( + "orc_bloom_filter_columns = ARRAY['%s'], bucketed_by = array['%s'], bucket_count = 1, sorted_by = ARRAY['%s']", + bloomFilterColumnName, + bucketingColumnName, + sortByColumnName); + } + + private void assertBloomFilterBasedRowGroupPruning(@Language("SQL") String sql) + { + assertQueryStats( + disableBloomFilters(getSession()), + sql, + queryStats -> { + assertThat(queryStats.getPhysicalInputPositions()).isGreaterThan(0); + assertThat(queryStats.getProcessedInputPositions()).isEqualTo(queryStats.getPhysicalInputPositions()); + }, + results -> assertThat(results.getRowCount()).isEqualTo(0)); + + assertQueryStats( + getSession(), + sql, + queryStats -> { + assertThat(queryStats.getPhysicalInputPositions()).isEqualTo(0); + assertThat(queryStats.getProcessedInputPositions()).isEqualTo(0); + }, + results -> assertThat(results.getRowCount()).isEqualTo(0)); + } + + private Session disableBloomFilters(Session session) + { + return Session.builder(session) + .setCatalogSessionProperty(session.getCatalog().orElseThrow(), 
"orc_bloom_filters_enabled", "false") + .build(); + } +} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcWriterOptions.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcWriterOptions.java index ccd1449e5163..a6b71af7b07a 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcWriterOptions.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcWriterOptions.java @@ -24,8 +24,8 @@ import static io.airlift.units.DataSize.Unit.MEGABYTE; import static io.trino.orc.OrcWriterOptions.WriterIdentification.LEGACY_HIVE_COMPATIBLE; import static io.trino.orc.OrcWriterOptions.WriterIdentification.TRINO; -import static io.trino.plugin.hive.HiveTableProperties.ORC_BLOOM_FILTER_COLUMNS; -import static io.trino.plugin.hive.HiveTableProperties.ORC_BLOOM_FILTER_FPP; +import static io.trino.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_COLUMNS_KEY; +import static io.trino.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_FPP_KEY; import static io.trino.plugin.hive.util.HiveUtil.getOrcWriterOptions; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; @@ -78,8 +78,8 @@ public void testOrcWriterOptionsFromOrcWriterConfig() public void testOrcWriterOptionsFromTableProperties() { Properties tableProperties = new Properties(); - tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS, "column_a, column_b"); - tableProperties.setProperty(ORC_BLOOM_FILTER_FPP, "0.5"); + tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS_KEY, "column_a, column_b"); + tableProperties.setProperty(ORC_BLOOM_FILTER_FPP_KEY, "0.5"); OrcWriterOptions orcWriterOptions = getOrcWriterOptions(tableProperties, new OrcWriterOptions()); assertThat(orcWriterOptions.getBloomFilterFpp()).isEqualTo(0.5); assertThat(orcWriterOptions.isBloomFilterColumn("column_a")).isTrue(); @@ -91,8 +91,8 @@ public void testOrcWriterOptionsFromTableProperties() public 
void testOrcWriterOptionsWithInvalidFPPValue() { Properties tableProperties = new Properties(); - tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS, "column_with_bloom_filter"); - tableProperties.setProperty(ORC_BLOOM_FILTER_FPP, "abc"); + tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS_KEY, "column_with_bloom_filter"); + tableProperties.setProperty(ORC_BLOOM_FILTER_FPP_KEY, "abc"); assertThatThrownBy(() -> getOrcWriterOptions(tableProperties, new OrcWriterOptions())) .hasMessage("Invalid value for orc_bloom_filter_fpp property: abc"); } @@ -101,8 +101,8 @@ public void testOrcWriterOptionsWithInvalidFPPValue() public void testOrcBloomFilterWithInvalidRange(String fpp) { Properties tableProperties = new Properties(); - tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS, "column_with_bloom_filter"); - tableProperties.setProperty(ORC_BLOOM_FILTER_FPP, fpp); + tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS_KEY, "column_with_bloom_filter"); + tableProperties.setProperty(ORC_BLOOM_FILTER_FPP_KEY, fpp); assertThatThrownBy(() -> getOrcWriterOptions(tableProperties, new OrcWriterOptions())) .hasMessage("bloomFilterFpp should be > 0.0 & < 1.0"); }