From 63f886655e5f2f7362d55b7f09bc6dfdcfda3213 Mon Sep 17 00:00:00 2001 From: Dain Sundstrom Date: Wed, 22 Feb 2023 21:12:03 -0800 Subject: [PATCH] Add REGEX to storage formats and missing table properties --- .../io/trino/plugin/hive/HiveMetadata.java | 39 +++++ .../java/io/trino/plugin/hive/HiveModule.java | 2 + .../trino/plugin/hive/HiveStorageFormat.java | 6 + .../plugin/hive/HiveTableProperties.java | 14 ++ .../hive/line/RegexFileWriterFactory.java | 55 +++++++ .../plugin/hive/util/HiveClassNames.java | 1 + .../trino/plugin/hive/AbstractTestHive.java | 5 +- .../hive/AbstractTestHiveFileSystem.java | 4 + .../plugin/hive/BaseHiveConnectorTest.java | 9 ++ .../io/trino/plugin/hive/HiveTestUtils.java | 2 + .../trino/plugin/hive/TestHivePageSink.java | 4 + .../io/trino/plugin/hive/TestRegexTable.java | 144 ++++++++++++++++++ .../product/hive/TestHiveStorageFormats.java | 2 + 13 files changed, 285 insertions(+), 2 deletions(-) create mode 100644 plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/RegexFileWriterFactory.java create mode 100644 plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestRegexTable.java diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java index 4e6ce6340c8c..7b5c8e8c75c4 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java @@ -153,6 +153,8 @@ import java.util.concurrent.CompletableFuture; import java.util.function.Function; import java.util.function.Supplier; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import java.util.stream.Stream; import static com.google.common.base.MoreObjects.firstNonNull; @@ -223,6 +225,8 @@ import static io.trino.plugin.hive.HiveTableProperties.ORC_BLOOM_FILTER_COLUMNS; import static io.trino.plugin.hive.HiveTableProperties.ORC_BLOOM_FILTER_FPP; import static 
io.trino.plugin.hive.HiveTableProperties.PARTITIONED_BY_PROPERTY; +import static io.trino.plugin.hive.HiveTableProperties.REGEX_CASE_INSENSITIVE; +import static io.trino.plugin.hive.HiveTableProperties.REGEX_PATTERN; import static io.trino.plugin.hive.HiveTableProperties.SKIP_FOOTER_LINE_COUNT; import static io.trino.plugin.hive.HiveTableProperties.SKIP_HEADER_LINE_COUNT; import static io.trino.plugin.hive.HiveTableProperties.SORTED_BY_PROPERTY; @@ -240,7 +244,9 @@ import static io.trino.plugin.hive.HiveTableProperties.getOrcBloomFilterColumns; import static io.trino.plugin.hive.HiveTableProperties.getOrcBloomFilterFpp; import static io.trino.plugin.hive.HiveTableProperties.getPartitionedBy; +import static io.trino.plugin.hive.HiveTableProperties.getRegexPattern; import static io.trino.plugin.hive.HiveTableProperties.getSingleCharacterProperty; +import static io.trino.plugin.hive.HiveTableProperties.isRegexCaseInsensitive; import static io.trino.plugin.hive.HiveTableProperties.isTransactional; import static io.trino.plugin.hive.HiveTimestampPrecision.NANOSECONDS; import static io.trino.plugin.hive.HiveType.HIVE_STRING; @@ -357,6 +363,9 @@ public class HiveMetadata private static final String CSV_QUOTE_KEY = "quoteChar"; private static final String CSV_ESCAPE_KEY = "escapeChar"; + private static final String REGEX_KEY = "input.regex"; + private static final String REGEX_CASE_INSENSITIVE_KEY = "input.regex.case.insensitive"; + private static final String AUTO_PURGE_KEY = "auto.purge"; public static final String MODIFYING_NON_TRANSACTIONAL_TABLE_MESSAGE = "Modifying Hive table rows is only supported for transactional tables"; @@ -685,6 +694,12 @@ private ConnectorTableMetadata doGetTableMetadata(ConnectorSession session, Sche getCsvSerdeProperty(table, CSV_ESCAPE_KEY) .ifPresent(csvEscape -> properties.put(CSV_ESCAPE, csvEscape)); + // REGEX specific properties + getSerdeProperty(table, REGEX_KEY) + .ifPresent(regex -> properties.put(REGEX_PATTERN, regex)); + 
getSerdeProperty(table, REGEX_CASE_INSENSITIVE_KEY) + .ifPresent(regexCaseInsensitive -> properties.put(REGEX_CASE_INSENSITIVE, parseBoolean(regexCaseInsensitive))); + Optional comment = Optional.ofNullable(table.getParameters().get(TABLE_COMMENT)); String autoPurgeProperty = table.getParameters().get(AUTO_PURGE_KEY); @@ -1094,6 +1109,30 @@ else if (avroSchemaLiteral != null) { tableProperties.put(CSV_SEPARATOR_KEY, separator.toString()); }); + // REGEX specific properties + getRegexPattern(tableMetadata.getProperties()) + .ifPresentOrElse( + regexPattern -> { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.REGEX, REGEX_PATTERN); + try { + Pattern.compile(regexPattern); + } + catch (PatternSyntaxException e) { + throw new TrinoException(INVALID_TABLE_PROPERTY, "Invalid REGEX pattern value: " + regexPattern, e); + } + tableProperties.put(REGEX_KEY, regexPattern); + }, + () -> { + if (hiveStorageFormat == HiveStorageFormat.REGEX) { + throw new TrinoException(INVALID_TABLE_PROPERTY, format("REGEX format requires the '%s' table property", REGEX_PATTERN)); + } + }); + isRegexCaseInsensitive(tableMetadata.getProperties()) + .ifPresent(regexCaseInsensitive -> { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.REGEX, REGEX_CASE_INSENSITIVE); + tableProperties.put(REGEX_CASE_INSENSITIVE_KEY, String.valueOf(regexCaseInsensitive)); + }); + // Set bogus table stats to prevent Hive 2.x from gathering these stats at table creation. // These stats are not useful by themselves and can take very long time to collect when creating an // external table over large data set. 
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveModule.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveModule.java index c9567fb90a8b..b7b648165a20 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveModule.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveModule.java @@ -30,6 +30,7 @@ import io.trino.plugin.hive.line.CsvPageSourceFactory; import io.trino.plugin.hive.line.JsonFileWriterFactory; import io.trino.plugin.hive.line.JsonPageSourceFactory; +import io.trino.plugin.hive.line.RegexFileWriterFactory; import io.trino.plugin.hive.line.RegexPageSourceFactory; import io.trino.plugin.hive.line.SimpleSequenceFilePageSourceFactory; import io.trino.plugin.hive.line.SimpleSequenceFileWriterFactory; @@ -147,6 +148,7 @@ public void configure(Binder binder) configBinder(binder).bindConfig(OrcWriterConfig.class); fileWriterFactoryBinder.addBinding().to(CsvFileWriterFactory.class).in(Scopes.SINGLETON); fileWriterFactoryBinder.addBinding().to(JsonFileWriterFactory.class).in(Scopes.SINGLETON); + fileWriterFactoryBinder.addBinding().to(RegexFileWriterFactory.class).in(Scopes.SINGLETON); fileWriterFactoryBinder.addBinding().to(SimpleTextFileWriterFactory.class).in(Scopes.SINGLETON); fileWriterFactoryBinder.addBinding().to(SimpleSequenceFileWriterFactory.class).in(Scopes.SINGLETON); fileWriterFactoryBinder.addBinding().to(OrcFileWriterFactory.class).in(Scopes.SINGLETON); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveStorageFormat.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveStorageFormat.java index c44baf6a765d..a239f8f784a8 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveStorageFormat.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveStorageFormat.java @@ -49,6 +49,7 @@ import static io.trino.plugin.hive.util.HiveClassNames.PARQUET_HIVE_SERDE_CLASS; import static 
io.trino.plugin.hive.util.HiveClassNames.RCFILE_INPUT_FORMAT_CLASS; import static io.trino.plugin.hive.util.HiveClassNames.RCFILE_OUTPUT_FORMAT_CLASS; +import static io.trino.plugin.hive.util.HiveClassNames.REGEX_HIVE_SERDE_CLASS; import static io.trino.plugin.hive.util.HiveClassNames.SEQUENCEFILE_INPUT_FORMAT_CLASS; import static io.trino.plugin.hive.util.HiveClassNames.TEXT_INPUT_FORMAT_CLASS; import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; @@ -101,6 +102,11 @@ public enum HiveStorageFormat OPENCSV_SERDE_CLASS, TEXT_INPUT_FORMAT_CLASS, HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS, + DataSize.of(8, Unit.MEGABYTE)), + REGEX( + REGEX_HIVE_SERDE_CLASS, + TEXT_INPUT_FORMAT_CLASS, + HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS, DataSize.of(8, Unit.MEGABYTE)); private final String serde; diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveTableProperties.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveTableProperties.java index ed505a19e584..328f494cbbfc 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveTableProperties.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveTableProperties.java @@ -65,6 +65,8 @@ public class HiveTableProperties public static final String CSV_SEPARATOR = "csv_separator"; public static final String CSV_QUOTE = "csv_quote"; public static final String CSV_ESCAPE = "csv_escape"; + public static final String REGEX_PATTERN = "regex"; + public static final String REGEX_CASE_INSENSITIVE = "regex_case_insensitive"; public static final String TRANSACTIONAL = "transactional"; public static final String AUTO_PURGE = "auto_purge"; @@ -153,6 +155,8 @@ public HiveTableProperties( stringProperty(CSV_SEPARATOR, "CSV separator character", null, false), stringProperty(CSV_QUOTE, "CSV quote character", null, false), stringProperty(CSV_ESCAPE, "CSV escape character", null, false), + stringProperty(REGEX_PATTERN, "REGEX pattern", null, false), + booleanProperty(REGEX_CASE_INSENSITIVE, "REGEX pattern 
is case insensitive", null, false), booleanProperty(TRANSACTIONAL, "Table is transactional", null, false), booleanProperty(AUTO_PURGE, "Skip trash when table or partition is deleted", config.isAutoPurge(), false), booleanProperty( @@ -288,6 +292,16 @@ public static Optional getSingleCharacterProperty(Map return Optional.of(stringValue.charAt(0)); } + public static Optional<String> getRegexPattern(Map<String, Object> tableProperties) + { + return Optional.ofNullable((String) tableProperties.get(REGEX_PATTERN)); + } + + public static Optional<Boolean> isRegexCaseInsensitive(Map<String, Object> tableProperties) + { + return Optional.ofNullable((Boolean) tableProperties.get(REGEX_CASE_INSENSITIVE)); + } + public static Optional isTransactional(Map tableProperties) { return Optional.ofNullable((Boolean) tableProperties.get(TRANSACTIONAL)); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/RegexFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/RegexFileWriterFactory.java new file mode 100644 index 000000000000..2eaeda18c5ae --- /dev/null +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/RegexFileWriterFactory.java @@ -0,0 +1,55 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hive.line; + +import io.trino.plugin.hive.FileWriter; +import io.trino.plugin.hive.HiveFileWriterFactory; +import io.trino.plugin.hive.WriterKind; +import io.trino.plugin.hive.acid.AcidTransaction; +import io.trino.plugin.hive.metastore.StorageFormat; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorSession; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; + +import java.util.List; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; + +import static io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR; +import static io.trino.plugin.hive.util.HiveClassNames.REGEX_HIVE_SERDE_CLASS; + +public class RegexFileWriterFactory + implements HiveFileWriterFactory +{ + @Override + public Optional<FileWriter> createFileWriter( + Path path, + List<String> inputColumnNames, + StorageFormat storageFormat, + Properties schema, + JobConf configuration, + ConnectorSession session, + OptionalInt bucketNumber, + AcidTransaction transaction, + boolean useAcidSchema, + WriterKind writerKind) + { + if (REGEX_HIVE_SERDE_CLASS.equals(storageFormat.getSerde())) { + throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "REGEX format is read-only"); + } + return Optional.empty(); + } +} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveClassNames.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveClassNames.java index 149f05d28fba..5d53b58092f0 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveClassNames.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveClassNames.java @@ -35,6 +35,7 @@ public final class HiveClassNames public static final String PARQUET_HIVE_SERDE_CLASS = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"; public static final String RCFILE_INPUT_FORMAT_CLASS = "org.apache.hadoop.hive.ql.io.RCFileInputFormat"; public static final String RCFILE_OUTPUT_FORMAT_CLASS = 
"org.apache.hadoop.hive.ql.io.RCFileOutputFormat"; + public static final String REGEX_HIVE_SERDE_CLASS = "org.apache.hadoop.hive.serde2.RegexSerDe"; public static final String SEQUENCEFILE_INPUT_FORMAT_CLASS = "org.apache.hadoop.mapred.SequenceFileInputFormat"; public static final String SYMLINK_TEXT_INPUT_FORMAT_CLASS = "org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat"; public static final String TEXT_INPUT_FORMAT_CLASS = "org.apache.hadoop.mapred.TextInputFormat"; diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHive.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHive.java index 7a793c320e2c..4db4b789d389 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHive.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHive.java @@ -226,6 +226,7 @@ import static io.trino.plugin.hive.HiveStorageFormat.PARQUET; import static io.trino.plugin.hive.HiveStorageFormat.RCBINARY; import static io.trino.plugin.hive.HiveStorageFormat.RCTEXT; +import static io.trino.plugin.hive.HiveStorageFormat.REGEX; import static io.trino.plugin.hive.HiveStorageFormat.SEQUENCEFILE; import static io.trino.plugin.hive.HiveStorageFormat.TEXTFILE; import static io.trino.plugin.hive.HiveTableProperties.BUCKETED_BY_PROPERTY; @@ -503,8 +504,8 @@ private static RowType toRowType(List columns) protected Set createTableFormats = difference( ImmutableSet.copyOf(HiveStorageFormat.values()), - // exclude formats that change table schema with serde - ImmutableSet.of(AVRO, CSV)); + // exclude formats that change table schema with serde and read-only formats + ImmutableSet.of(AVRO, CSV, REGEX)); private static final TypeOperators TYPE_OPERATORS = new TypeOperators(); private static final BlockTypeOperators BLOCK_TYPE_OPERATORS = new BlockTypeOperators(TYPE_OPERATORS); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystem.java 
b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystem.java index 2471f1c9fbc1..9f17bce3e35f 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystem.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystem.java @@ -496,6 +496,10 @@ public void testTableCreation() // CSV supports only unbounded VARCHAR type continue; } + if (storageFormat == HiveStorageFormat.REGEX) { + // REGEX format is read-only + continue; + } createTable(temporaryCreateTable, storageFormat); dropTable(temporaryCreateTable); } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java index 7d0dee8a3387..812cb8cfb1a2 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java @@ -120,6 +120,7 @@ import static io.trino.plugin.hive.HiveQueryRunner.HIVE_CATALOG; import static io.trino.plugin.hive.HiveQueryRunner.TPCH_SCHEMA; import static io.trino.plugin.hive.HiveQueryRunner.createBucketedSession; +import static io.trino.plugin.hive.HiveStorageFormat.REGEX; import static io.trino.plugin.hive.HiveTableProperties.AUTO_PURGE; import static io.trino.plugin.hive.HiveTableProperties.BUCKETED_BY_PROPERTY; import static io.trino.plugin.hive.HiveTableProperties.BUCKET_COUNT_PROPERTY; @@ -2055,6 +2056,10 @@ public void testEmptyBucketedTable() { // create empty bucket files for all storage formats and compression codecs for (HiveStorageFormat storageFormat : HiveStorageFormat.values()) { + if (storageFormat == REGEX) { + // REGEX format is readonly + continue; + } for (HiveCompressionCodec compressionCodec : HiveCompressionCodec.values()) { if ((storageFormat == HiveStorageFormat.AVRO) && (compressionCodec == HiveCompressionCodec.LZ4)) { continue; @@ -8588,6 +8593,10 @@ private 
List getAllTestingHiveStorageFormat() // CSV supports only unbounded VARCHAR type continue; } + if (hiveStorageFormat == REGEX) { + // REGEX format is read-only + continue; + } if (hiveStorageFormat == HiveStorageFormat.PARQUET) { formats.add(new TestingHiveStorageFormat( Session.builder(session) diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/HiveTestUtils.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/HiveTestUtils.java index dc749819e2ba..c7d2aaf43c04 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/HiveTestUtils.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/HiveTestUtils.java @@ -35,6 +35,7 @@ import io.trino.plugin.hive.line.CsvPageSourceFactory; import io.trino.plugin.hive.line.JsonFileWriterFactory; import io.trino.plugin.hive.line.JsonPageSourceFactory; +import io.trino.plugin.hive.line.RegexFileWriterFactory; import io.trino.plugin.hive.line.RegexPageSourceFactory; import io.trino.plugin.hive.line.SimpleSequenceFilePageSourceFactory; import io.trino.plugin.hive.line.SimpleSequenceFileWriterFactory; @@ -224,6 +225,7 @@ public static Set getDefaultHiveFileWriterFactories(HiveC return ImmutableSet.builder() .add(new CsvFileWriterFactory(fileSystemFactory, TESTING_TYPE_MANAGER)) .add(new JsonFileWriterFactory(fileSystemFactory, TESTING_TYPE_MANAGER)) + .add(new RegexFileWriterFactory()) .add(new SimpleTextFileWriterFactory(fileSystemFactory, TESTING_TYPE_MANAGER)) .add(new SimpleSequenceFileWriterFactory(fileSystemFactory, TESTING_TYPE_MANAGER, nodeVersion)) .add(new RcFileFileWriterFactory(hdfsEnvironment, TESTING_TYPE_MANAGER, nodeVersion, hiveConfig)) diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java index a4a77a9ecb3c..b0a1ab0bf899 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java +++ 
b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java @@ -116,6 +116,10 @@ public void testAllFormats() // CSV supports only unbounded VARCHAR type, which is not provided by lineitem continue; } + if (format == HiveStorageFormat.REGEX) { + // REGEX format is readonly + continue; + } config.setHiveStorageFormat(format); config.setHiveCompressionCodec(NONE); long uncompressedLength = writeTestFile(config, metastore, makeFileName(tempDir, config)); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestRegexTable.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestRegexTable.java new file mode 100644 index 000000000000..d7bd4d5a9d9a --- /dev/null +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestRegexTable.java @@ -0,0 +1,144 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.MaterializedResult; +import io.trino.testing.QueryRunner; +import org.intellij.lang.annotations.Language; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.nio.file.Path; + +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder; +import static java.nio.file.Files.createTempDirectory; + +public class TestRegexTable + extends AbstractTestQueryFramework +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return HiveQueryRunner.builder() + .setHiveProperties(ImmutableMap.of("hive.non-managed-table-writes-enabled", "true")) + .build(); + } + + @Test + public void testCreateExternalTableWithData() + throws IOException + { + Path tempDir = createTempDirectory(null); + Path tableLocation = tempDir.resolve("data"); + + // REGEX format is read-only, so create data files using the text file format + @Language("SQL") String createTableSql = """ + CREATE TABLE test_regex_data + WITH ( + format = 'textfile', + textfile_field_separator = 'x', + external_location = '%s') + AS SELECT nationkey, name FROM tpch.tiny.nation + """.formatted(tableLocation.toUri().toASCIIString()); + assertUpdate(createTableSql, 25); + + MaterializedResult expected = computeActual("SELECT nationkey, name FROM tpch.tiny.nation"); + MaterializedResult actual = computeActual("SELECT nationkey, name FROM test_regex_data"); + assertEqualsIgnoreOrder(actual.getMaterializedRows(), expected.getMaterializedRows()); + + // REGEX table over the text file created data + createTableSql = """ + CREATE TABLE test_regex ( + nationkey BIGINT, + name VARCHAR) + WITH ( + format = 'regex', + regex = '(\\d+)x(.+)', + external_location = '%s') + 
""".formatted(tableLocation.toUri().toASCIIString()); + assertUpdate(createTableSql); + + actual = computeActual("SELECT nationkey, name FROM test_regex"); + assertEqualsIgnoreOrder(actual.getMaterializedRows(), expected.getMaterializedRows()); + + // Verify REGEX read-only is enforced + assertQueryFails("INSERT INTO test_regex VALUES (42, 'name')", "REGEX format is read-only"); + + // case insensitive + assertUpdate("DROP TABLE test_regex"); + createTableSql = """ + CREATE TABLE test_regex ( + nationkey BIGINT, + name VARCHAR) + WITH ( + format = 'regex', + regex = '(\\d+)X(.+)', + regex_case_insensitive = true, + external_location = '%s') + """.formatted(tableLocation.toUri().toASCIIString()); + assertUpdate(createTableSql); + actual = computeActual("SELECT nationkey, name FROM test_regex"); + assertEqualsIgnoreOrder(actual.getMaterializedRows(), expected.getMaterializedRows()); + + // case-sensitive with no-match + assertUpdate("DROP TABLE test_regex"); + createTableSql = """ + CREATE TABLE test_regex ( + nationkey BIGINT, + name VARCHAR) + WITH ( + format = 'regex', + regex = '(\\d+)X(.+)', + external_location = '%s') + """.formatted(tableLocation.toUri().toASCIIString()); + assertUpdate(createTableSql); + // when the pattern does not match all columns are null + assertQueryReturnsEmptyResult("SELECT nationkey, name FROM test_regex WHERE nationkey IS NOT NULL AND name IS NOT NULL"); + + assertUpdate("DROP TABLE test_regex"); + assertUpdate("DROP TABLE test_regex_data"); + deleteRecursively(tempDir, ALLOW_INSECURE); + } + + @Test + public void testRegexPropertyIsRequired() + { + assertQueryFails(""" + CREATE TABLE test_regex_property_required ( + nationkey BIGINT, + name VARCHAR) + WITH (format = 'regex') + """, + "REGEX format requires the 'regex' table property"); + } + + @Test + public void testInvalidRegexProperty() + { + assertQueryFails(""" + CREATE TABLE test_regex_property_required ( + nationkey BIGINT, + name VARCHAR) + WITH ( + format = 'regex', + 
regex = '\\J') + """, + "Invalid REGEX pattern value: \\\\J"); + } +} diff --git a/testing/trino-product-tests/src/main/java/io/trino/tests/product/hive/TestHiveStorageFormats.java b/testing/trino-product-tests/src/main/java/io/trino/tests/product/hive/TestHiveStorageFormats.java index 277f962f5db3..609245952289 100644 --- a/testing/trino-product-tests/src/main/java/io/trino/tests/product/hive/TestHiveStorageFormats.java +++ b/testing/trino-product-tests/src/main/java/io/trino/tests/product/hive/TestHiveStorageFormats.java @@ -304,6 +304,8 @@ public void verifyDataProviderCompleteness() Set allFormatsToTest = allFormats.stream() // Hive CSV storage format only supports VARCHAR, so needs to be excluded from any generic tests .filter(format -> !"CSV".equals(format)) + // REGEX is read-only + .filter(format -> !"REGEX".equals(format)) // TODO when using JSON serde Hive fails with ClassNotFoundException: org.apache.hive.hcatalog.data.JsonSerDe .filter(format -> !"JSON".equals(format)) .collect(toImmutableSet());