diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/ColumnInfo.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/ColumnInfo.java
index 9bd28af238b4..81e0964fa397 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/ColumnInfo.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/ColumnInfo.java
@@ -13,9 +13,11 @@
  */
 package io.trino.plugin.faker;
 
+import com.google.common.collect.ImmutableMap;
 import io.trino.spi.connector.ColumnMetadata;
 import io.trino.spi.type.Type;
 
+import java.util.Map;
 import java.util.Optional;
 
 import static java.util.Objects.requireNonNull;
@@ -57,4 +59,16 @@ public ColumnInfo withComment(Optional<String> comment)
                 .setComment(comment)
                 .build());
     }
+
+    public ColumnInfo withHandle(FakerColumnHandle handle)
+    {
+        return new ColumnInfo(handle, metadata);
+    }
+
+    public ColumnInfo withProperties(Map<String, Object> properties)
+    {
+        return new ColumnInfo(handle, ColumnMetadata.builderFrom(metadata)
+                .setProperties(ImmutableMap.copyOf(properties))
+                .build());
+    }
 }
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerColumnHandle.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerColumnHandle.java
index c7af196596ca..4b81b75858ca 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerColumnHandle.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerColumnHandle.java
@@ -174,4 +174,19 @@ private static List<String> strings(Collection<?> values)
                 .map(String.class::cast)
                 .collect(toImmutableList());
     }
+
+    public FakerColumnHandle withNullProbability(double nullProbability)
+    {
+        return new FakerColumnHandle(columnIndex, name, type, nullProbability, generator, domain, step);
+    }
+
+    public FakerColumnHandle withDomain(Domain domain)
+    {
+        return new FakerColumnHandle(columnIndex, name, type, nullProbability, generator, domain, step);
+    }
+
+    public FakerColumnHandle withStep(ValueSet step)
+    {
+        return new FakerColumnHandle(columnIndex, name, type, nullProbability, generator, domain, step);
+    }
 }
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConfig.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConfig.java
index e4e69f5d6004..070a1e5753d7 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConfig.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConfig.java
@@ -26,6 +26,8 @@ public class FakerConfig
     private double nullProbability = 0.5;
     private long defaultLimit = 1000L;
     private Locale locale = Locale.ENGLISH;
+    private boolean sequenceDetectionEnabled = true;
+    private boolean dictionaryDetectionEnabled = true;
 
     @Max(1)
     @Min(0)
@@ -68,4 +70,36 @@ public FakerConfig setLocale(String value)
         this.locale = new Locale.Builder().setLanguageTag(value).build();
         return this;
     }
+
+    public boolean isSequenceDetectionEnabled()
+    {
+        return sequenceDetectionEnabled;
+    }
+
+    @Config("faker.sequence-detection-enabled")
+    @ConfigDescription(
+            """
+            If true, when creating a table using existing data, columns with the number of distinct values close to
+            the number of rows will be treated as sequences""")
+    public FakerConfig setSequenceDetectionEnabled(boolean value)
+    {
+        this.sequenceDetectionEnabled = value;
+        return this;
+    }
+
+    public boolean isDictionaryDetectionEnabled()
+    {
+        return dictionaryDetectionEnabled;
+    }
+
+    @Config("faker.dictionary-detection-enabled")
+    @ConfigDescription(
+            """
+            If true, when creating a table using existing data, columns with a low number of distinct values
+            will have the allowed_values column property populated with random values""")
+    public FakerConfig setDictionaryDetectionEnabled(boolean value)
+    {
+        this.dictionaryDetectionEnabled = value;
+        return this;
+    }
 }
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConnector.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConnector.java
index 734fa513b355..322fbfa006ea 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConnector.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerConnector.java
@@ -44,6 +44,7 @@
 import static io.trino.spi.StandardErrorCode.INVALID_SCHEMA_PROPERTY;
 import static io.trino.spi.StandardErrorCode.INVALID_TABLE_PROPERTY;
 import static io.trino.spi.connector.ConnectorCapabilities.NOT_NULL_COLUMN_CONSTRAINT;
+import static io.trino.spi.session.PropertyMetadata.booleanProperty;
 import static io.trino.spi.session.PropertyMetadata.doubleProperty;
 import static io.trino.spi.session.PropertyMetadata.longProperty;
 import static io.trino.spi.session.PropertyMetadata.stringProperty;
@@ -125,6 +126,20 @@ public List<PropertyMetadata<?>> getSchemaProperties()
                         "Default limit of rows returned from any table in this schema, if not specified in the query",
                         null,
                         defaultLimit -> checkProperty(1 <= defaultLimit, INVALID_SCHEMA_PROPERTY, "default_limit value must be equal or greater than 1"),
+                        false),
+                booleanProperty(
+                        SchemaInfo.SEQUENCE_DETECTION_ENABLED,
+                        """
+                        If true, when creating a table using existing data, columns with the number of distinct values close to
+                        the number of rows will be treated as sequences""",
+                        null,
+                        false),
+                booleanProperty(
+                        SchemaInfo.DICTIONARY_DETECTION_ENABLED,
+                        """
+                        If true, when creating a table using existing data, columns with a low number of distinct values
+                        will have the allowed_values column property populated with random values""",
+                        null,
                         false));
     }
 
@@ -143,6 +158,20 @@ public List<PropertyMetadata<?>> getTableProperties()
                         "Default limit of rows returned from this table if not specified in the query",
                         null,
                         defaultLimit -> checkProperty(1 <= defaultLimit, INVALID_TABLE_PROPERTY, "default_limit value must be equal or greater than 1"),
+                        false),
+                booleanProperty(
+                        TableInfo.SEQUENCE_DETECTION_ENABLED,
+                        """
+                        If true, when creating a table using existing data, columns with the number of distinct values close to
+                        the number of rows will be treated as sequences""",
+                        null,
+                        false),
+                booleanProperty(
+                        TableInfo.DICTIONARY_DETECTION_ENABLED,
+                        """
+                        If true, when creating a table using existing data, columns with a low number of distinct values
+                        will have the allowed_values column property populated with random values""",
+                        null,
                         false));
     }
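The interaction of the new toggles is easiest to see in SQL. A minimal sketch of how they compose (catalog, schema, and table names are illustrative; the property names are registered in FakerConnector above, and table properties take precedence over schema properties, which take precedence over the faker.sequence-detection-enabled and faker.dictionary-detection-enabled config defaults):

    -- Turn sequence detection off for every table created in one schema.
    CREATE SCHEMA faker.stats WITH (sequence_detection_enabled = false);

    -- Re-enable it for a single table at CTAS time.
    CREATE TABLE faker.stats.orders
    WITH (sequence_detection_enabled = true, dictionary_detection_enabled = false)
    AS SELECT * FROM tpch.tiny.orders;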
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerMetadata.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerMetadata.java
index 8dbcacb58195..a8ae2f291fd7 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerMetadata.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerMetadata.java
@@ -18,7 +18,9 @@
 import com.google.common.collect.ImmutableMap;
 import com.google.errorprone.annotations.concurrent.GuardedBy;
 import io.airlift.slice.Slice;
+import io.trino.spi.Page;
 import io.trino.spi.TrinoException;
+import io.trino.spi.block.Block;
 import io.trino.spi.connector.ColumnHandle;
 import io.trino.spi.connector.ColumnMetadata;
 import io.trino.spi.connector.ConnectorMetadata;
@@ -46,9 +48,15 @@
 import io.trino.spi.predicate.Range;
 import io.trino.spi.predicate.ValueSet;
 import io.trino.spi.security.TrinoPrincipal;
+import io.trino.spi.statistics.ColumnStatisticMetadata;
 import io.trino.spi.statistics.ComputedStatistics;
-import io.trino.spi.type.BigintType;
+import io.trino.spi.statistics.TableStatisticsMetadata;
+import io.trino.spi.type.CharType;
+import io.trino.spi.type.Type;
+import io.trino.spi.type.VarbinaryType;
+import io.trino.spi.type.VarcharType;
 import jakarta.inject.Inject;
+import net.datafaker.Faker;
 
 import java.util.ArrayList;
 import java.util.Collection;
@@ -56,17 +64,27 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Optional;
+import java.util.Random;
 import java.util.Set;
 import java.util.function.UnaryOperator;
-import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 import java.util.stream.Stream;
 
+import static com.google.common.base.MoreObjects.firstNonNull;
 import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
 import static com.google.common.base.Verify.verify;
 import static com.google.common.collect.ImmutableList.toImmutableList;
 import static com.google.common.collect.ImmutableMap.toImmutableMap;
+import static com.google.common.collect.ImmutableSet.toImmutableSet;
 import static com.google.common.collect.Maps.filterKeys;
+import static io.trino.plugin.faker.ColumnInfo.ALLOWED_VALUES_PROPERTY;
+import static io.trino.plugin.faker.ColumnInfo.MAX_PROPERTY;
+import static io.trino.plugin.faker.ColumnInfo.MIN_PROPERTY;
+import static io.trino.plugin.faker.ColumnInfo.NULL_PROBABILITY_PROPERTY;
+import static io.trino.plugin.faker.ColumnInfo.STEP_PROPERTY;
 import static io.trino.spi.StandardErrorCode.ALREADY_EXISTS;
 import static io.trino.spi.StandardErrorCode.INVALID_COLUMN_REFERENCE;
 import static io.trino.spi.StandardErrorCode.NOT_FOUND;
@@ -75,6 +93,16 @@
 import static io.trino.spi.StandardErrorCode.SCHEMA_NOT_FOUND;
 import static io.trino.spi.StandardErrorCode.TABLE_ALREADY_EXISTS;
 import static io.trino.spi.connector.RetryMode.NO_RETRIES;
+import static io.trino.spi.statistics.ColumnStatisticType.MAX_VALUE;
+import static io.trino.spi.statistics.ColumnStatisticType.MIN_VALUE;
+import static io.trino.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES;
+import static io.trino.spi.statistics.ColumnStatisticType.NUMBER_OF_NON_NULL_VALUES;
+import static io.trino.spi.statistics.TableStatisticType.ROW_COUNT;
+import static io.trino.spi.type.BigintType.BIGINT;
+import static io.trino.spi.type.IntegerType.INTEGER;
+import static io.trino.spi.type.SmallintType.SMALLINT;
+import static io.trino.spi.type.TinyintType.TINYINT;
+import static io.trino.spi.type.TypeUtils.readNativeValue;
 import static java.lang.String.format;
 import static java.util.Objects.requireNonNull;
 
@@ -83,13 +111,20 @@ public class FakerMetadata
 {
     public static final String SCHEMA_NAME = "default";
     public static final String ROW_ID_COLUMN_NAME = "$row_id";
+    public static final double MIN_SEQUENCE_RATIO = 0.98;
+    public static final long MAX_DICTIONARY_SIZE = 1000L;
 
     @GuardedBy("this")
     private final List<SchemaInfo> schemas = new ArrayList<>();
    private final double nullProbability;
     private final long defaultLimit;
+    private final boolean isSequenceDetectionEnabled;
+    private final boolean isDictionaryDetectionEnabled;
     private final FakerFunctionProvider functionsProvider;
+    private final Random random;
+    private final Faker faker;
+
     @GuardedBy("this")
     private final Map<SchemaTableName, TableInfo> tables = new HashMap<>();
 
     @GuardedBy("this")
@@ -101,7 +136,11 @@ public FakerMetadata(FakerConfig config, FakerFunctionProvider functionProvider)
         this.schemas.add(new SchemaInfo(SCHEMA_NAME, Map.of()));
         this.nullProbability = config.getNullProbability();
         this.defaultLimit = config.getDefaultLimit();
+        this.isSequenceDetectionEnabled = config.isSequenceDetectionEnabled();
+        this.isDictionaryDetectionEnabled = config.isDictionaryDetectionEnabled();
         this.functionsProvider = requireNonNull(functionProvider, "functionProvider is null");
+        this.random = new Random(1);
+        this.faker = new Faker(random);
     }
 
     @Override
@@ -264,7 +303,7 @@ public synchronized void setTableProperties(ConnectorSession session, ConnectorT
                 properties.entrySet().stream()
                         .filter(entry -> entry.getValue().isPresent())
                         .map(entry -> Map.entry(entry.getKey(), entry.getValue().get())))
-                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+                .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue));
 
         tables.put(tableName, oldInfo.withProperties(newProperties));
     }
@@ -336,14 +375,14 @@ public synchronized FakerOutputTableHandle beginCreateTable(ConnectorSession ses
                     new FakerColumnHandle(
                             columnId,
                             ROW_ID_COLUMN_NAME,
-                            BigintType.BIGINT,
+                            BIGINT,
                             0,
                             "",
-                            Domain.create(ValueSet.ofRanges(Range.greaterThanOrEqual(BigintType.BIGINT, 0L)), false),
-                            ValueSet.of(BigintType.BIGINT, 1L)),
+                            Domain.create(ValueSet.ofRanges(Range.greaterThanOrEqual(BIGINT, 0L)), false),
+                            ValueSet.of(BIGINT, 1L)),
                     ColumnMetadata.builder()
                             .setName(ROW_ID_COLUMN_NAME)
-                            .setType(BigintType.BIGINT)
+                            .setType(BIGINT)
                             .setHidden(true)
                             .setNullable(false)
                             .build()));
@@ -356,6 +395,16 @@
         return new FakerOutputTableHandle(tableName);
     }
 
+    private static boolean isNotRangeType(Type type)
+    {
+        return type instanceof CharType || type instanceof VarcharType || type instanceof VarbinaryType;
+    }
+
+    private static boolean isSequenceType(Type type)
+    {
+        return BIGINT.equals(type) || INTEGER.equals(type) || SMALLINT.equals(type) || TINYINT.equals(type);
+    }
+
     private synchronized void checkSchemaExists(String schemaName)
     {
         if (schemas.stream().noneMatch(schema -> schema.name().equals(schemaName))) {
@@ -392,11 +441,170 @@ public synchronized Optional<ConnectorOutputMetadata> finishCreateTable(
         TableInfo info = tables.get(tableName);
         requireNonNull(info, "info is null");
 
-        tables.put(tableName, new TableInfo(info.columns(), info.properties(), info.comment()));
+        tables.put(tableName, createTableInfoFromStats(tableName, info, computedStatistics));
 
         return Optional.empty();
     }
 
+    private synchronized TableInfo createTableInfoFromStats(SchemaTableName tableName, TableInfo info, Collection<ComputedStatistics> computedStatistics)
+    {
+        if (computedStatistics.isEmpty()) {
+            return info;
+        }
+        ImmutableMap.Builder<String, Object> minimumsBuilder = ImmutableMap.builder();
+        ImmutableMap.Builder<String, Object> maximumsBuilder = ImmutableMap.builder();
+        ImmutableMap.Builder<String, Long> distinctValuesBuilder = ImmutableMap.builder();
+        ImmutableMap.Builder<String, Long> nonNullValuesBuilder = ImmutableMap.builder();
+        List<ColumnInfo> columns = info.columns();
+        Map<String, Type> types = columns.stream().collect(toImmutableMap(ColumnInfo::name, ColumnInfo::type));
+        Long rowCount = null;
+        Optional<ComputedStatistics> optionalStatistic = computedStatistics.stream().reduce((_, _) -> {
+            throw new IllegalStateException("Found more than one computed statistic");
+        });
+        if (optionalStatistic.isPresent()) {
+            ComputedStatistics statistic = optionalStatistic.get();
+            if (!statistic.getTableStatistics().get(ROW_COUNT).isNull(0)) {
+                rowCount = BIGINT.getLong(statistic.getTableStatistics().get(ROW_COUNT), 0);
+            }
+            statistic.getColumnStatistics().forEach((metadata, block) -> {
+                checkState(block.getPositionCount() == 1, "Expected a single position in aggregation result block");
+                String columnName = metadata.getColumnName();
+                Type type = types.get(columnName);
+                if (block.isNull(0) || type == null) {
+                    return;
+                }
+                switch (metadata.getStatisticType()) {
+                    case MIN_VALUE -> minimumsBuilder.put(columnName, requireNonNull(readNativeValue(type, block, 0)));
+                    case MAX_VALUE -> maximumsBuilder.put(columnName, requireNonNull(readNativeValue(type, block, 0)));
+                    case NUMBER_OF_DISTINCT_VALUES -> distinctValuesBuilder.put(columnName, BIGINT.getLong(block, 0));
+                    case NUMBER_OF_NON_NULL_VALUES -> nonNullValuesBuilder.put(columnName, BIGINT.getLong(block, 0));
+                    default -> throw new IllegalArgumentException("Unexpected column statistic type: " + metadata.getStatisticType());
+                }
+            });
+        }
+        Map<String, Object> minimums = minimumsBuilder.buildOrThrow();
+        Map<String, Object> maximums = maximumsBuilder.buildOrThrow();
+        Map<String, Long> distinctValues = distinctValuesBuilder.buildOrThrow();
+        Map<String, Long> nonNullValues = info.properties().containsKey(TableInfo.NULL_PROBABILITY_PROPERTY) ? ImmutableMap.of() : nonNullValuesBuilder.buildOrThrow();
+
+        if (!info.properties().containsKey(TableInfo.DEFAULT_LIMIT_PROPERTY) && rowCount != null) {
+            info = info.withProperties(ImmutableMap.<String, Object>builder()
+                    .putAll(info.properties())
+                    .put(TableInfo.DEFAULT_LIMIT_PROPERTY, rowCount)
+                    .buildOrThrow());
+        }
+
+        long finalRowCount = firstNonNull(rowCount, 1L);
+        SchemaInfo schema = getSchema(tableName.getSchemaName());
+        boolean isSchemaSequenceDetectionEnabled = (boolean) schema.properties().getOrDefault(SchemaInfo.SEQUENCE_DETECTION_ENABLED, isSequenceDetectionEnabled);
+        boolean isTableSequenceDetectionEnabled = (boolean) info.properties().getOrDefault(TableInfo.SEQUENCE_DETECTION_ENABLED, isSchemaSequenceDetectionEnabled);
+        Map<String, List<Object>> columnValues = getColumnValues(tableName, info, distinctValues, minimums, maximums);
+        return info.withColumns(columns.stream().map(column -> createColumnInfoFromStats(
+                        column,
+                        minimums.get(column.name()),
+                        maximums.get(column.name()),
+                        requireNonNull(distinctValues.getOrDefault(column.name(), 0L)),
+                        Optional.ofNullable(nonNullValues.get(column.name())),
+                        finalRowCount,
+                        isTableSequenceDetectionEnabled,
+                        columnValues.get(column.name())))
+                .collect(toImmutableList()));
+    }
+
+    private static ColumnInfo createColumnInfoFromStats(ColumnInfo column, Object min, Object max, long distinctValues, Optional<Long> nonNullValues, long rowCount, boolean isSequenceDetectionEnabled, List<Object> allowedValues)
+    {
+        if (isNotRangeType(column.type())) {
+            return column;
+        }
+        FakerColumnHandle handle = column.handle();
+        Map<String, Object> properties = new HashMap<>(column.metadata().getProperties());
+        if (allowedValues != null) {
+            handle = handle.withDomain(Domain.create(ValueSet.copyOf(column.type(), allowedValues), false));
+            properties.put(ALLOWED_VALUES_PROPERTY, allowedValues.stream()
+                    .map(value -> Literal.format(column.type(), value))
+                    .collect(toImmutableList()));
+        }
+        else if (min != null && max != null) {
+            handle = handle.withDomain(Domain.create(ValueSet.ofRanges(Range.range(column.type(), min, true, max, true)), false));
+            properties.put(MIN_PROPERTY, Literal.format(column.type(), min));
+            properties.put(MAX_PROPERTY, Literal.format(column.type(), max));
+        }
+        if (nonNullValues.isPresent()) {
+            double nullProbability = 1 - (double) nonNullValues.get() / rowCount;
+            handle = handle.withNullProbability(nullProbability);
+            properties.put(NULL_PROBABILITY_PROPERTY, nullProbability);
+        }
+        // Only include types that support generating sequences in FakerPageSource,
+        // but don't include types with configurable precision, dates, or intervals.
+        // The number of distinct values is an approximation, so compare it with a margin.
+        if (isSequenceDetectionEnabled && isSequenceType(column.type()) && (double) distinctValues / rowCount >= MIN_SEQUENCE_RATIO) {
+            handle = handle.withStep(ValueSet.of(column.type(), 1L));
+            properties.put(STEP_PROPERTY, "1");
+        }
+        return column
+                .withHandle(handle)
+                .withProperties(properties);
+    }
+
+    private Map<String, List<Object>> getColumnValues(SchemaTableName tableName, TableInfo info, Map<String, Long> distinctValues, Map<String, Object> minimums, Map<String, Object> maximums)
+    {
+        boolean schemaDictionaryDetectionEnabled = (boolean) getSchema(tableName.getSchemaName()).properties().getOrDefault(SchemaInfo.DICTIONARY_DETECTION_ENABLED, isDictionaryDetectionEnabled);
+        boolean tableDictionaryDetectionEnabled = (boolean) info.properties().getOrDefault(TableInfo.DICTIONARY_DETECTION_ENABLED, schemaDictionaryDetectionEnabled);
+        if (!tableDictionaryDetectionEnabled || distinctValues.isEmpty()) {
+            return ImmutableMap.of();
+        }
+        List<ColumnInfo> columns = info.columns();
+        Map<String, FakerColumnHandle> columnHandles = info.columns().stream()
+                .collect(toImmutableMap(ColumnInfo::name, column -> column.handle().withNullProbability(0)));
+        List<FakerColumnHandle> dictionaryColumns = distinctValues.entrySet().stream()
+                .filter(entry -> entry.getValue() <= MAX_DICTIONARY_SIZE)
+                .map(entry -> columnHandles.get(entry.getKey()))
+                .filter(Objects::nonNull)
+                .map(column -> !minimums.containsKey(column.name()) ? column : column.withDomain(Domain.create(ValueSet.ofRanges(Range.range(
+                        column.type(),
+                        minimums.get(column.name()),
+                        true,
+                        maximums.get(column.name()),
+                        true)), false)))
+                .collect(toImmutableList());
+        ImmutableMap.Builder<String, List<Object>> columnValues = ImmutableMap.builder();
+        try (FakerPageSource pageSource = new FakerPageSource(faker, random, dictionaryColumns, 0, MAX_DICTIONARY_SIZE * 2)) {
+            Page page = null;
+            while (page == null) {
+                page = pageSource.getNextPage();
+            }
+            Map<String, Type> types = columns.stream().collect(toImmutableMap(ColumnInfo::name, ColumnInfo::type));
+            for (int channel = 0; channel < dictionaryColumns.size(); channel++) {
+                String column = dictionaryColumns.get(channel).name();
+                Block block = page.getBlock(channel);
+                Type type = types.get(column);
+                List<Object> values = IntStream.range(0, page.getPositionCount())
+                        .mapToObj(position -> readNativeValue(type, block, position))
+                        .distinct()
+                        .limit(distinctValues.get(column))
+                        .collect(toImmutableList());
+                columnValues.put(column, values);
+            }
+        }
+        return columnValues.buildOrThrow();
+    }
+
+    @Override
+    public TableStatisticsMetadata getStatisticsCollectionMetadataForWrite(ConnectorSession session, ConnectorTableMetadata tableMetadata)
+    {
+        return new TableStatisticsMetadata(
+                tableMetadata.getColumns().stream()
+                        .filter(column -> !column.isHidden())
+                        .flatMap(column -> Stream.of(
+                                new ColumnStatisticMetadata(column.getName(), MIN_VALUE),
+                                new ColumnStatisticMetadata(column.getName(), MAX_VALUE),
+                                new ColumnStatisticMetadata(column.getName(), NUMBER_OF_DISTINCT_VALUES),
+                                new ColumnStatisticMetadata(column.getName(), NUMBER_OF_NON_NULL_VALUES)))
+                        .collect(toImmutableSet()),
+                Set.of(ROW_COUNT),
+                List.of());
+    }
+
     @Override
     public synchronized List<SchemaTableName> listViews(ConnectorSession session, Optional<String> schemaName)
     {
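To make the inference above concrete: with MIN_SEQUENCE_RATIO = 0.98 and the null probability derived as 1 - nonNullValues / rowCount, a CTAS over TABLE(sequence(start => -500, stop => 500)) comes out roughly as below (a sketch matching the patterns asserted in TestFakerQueries at the end of this diff, not verbatim output):

    SHOW CREATE TABLE faker.default.seq;
    -- ...
    --    seq_integer integer WITH (max = '500', min = '-500', step = '1'),
    --    seq_tinyint tinyint WITH (max = '127', min = '-128'),
    -- ...

The tinyint column is clamped to 256 distinct values over 1001 rows, so it falls below the 0.98 ratio and keeps only a min/max range, while the integer column is a near-perfect sequence and also gets step = '1'.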
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerPageSinkProvider.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerPageSinkProvider.java
index f1ec8d0613d2..2ed4593b897b 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerPageSinkProvider.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerPageSinkProvider.java
@@ -30,6 +30,10 @@
 
 import static java.util.concurrent.CompletableFuture.completedFuture;
 
+/**
+ * The Faker connector doesn't write any data; the passthrough implementation here is provided only
+ * to allow CTAS queries to collect statistics about the source data for table creation in Faker.
+ */
 public class FakerPageSinkProvider
         implements ConnectorPageSinkProvider
 {
@@ -51,7 +55,7 @@ private static class PageSink
         @Override
         public CompletableFuture<?> appendPage(Page page)
         {
-            throw new UnsupportedOperationException("The faker connector does not support writes");
+            return NOT_BLOCKED;
         }
 
         @Override
@@ -63,7 +67,6 @@ public CompletableFuture<Collection<Slice>> finish()
         @Override
         public void abort()
         {
-            throw new UnsupportedOperationException("The faker connector does not support writes");
         }
     }
 }
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/Literal.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/Literal.java
index 410f259bcb0c..6d2afae5fafc 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/Literal.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/Literal.java
@@ -16,22 +16,40 @@
 import com.google.common.base.CharMatcher;
 import com.google.common.io.BaseEncoding;
 import com.google.common.net.InetAddresses;
+import com.google.common.primitives.Shorts;
+import com.google.common.primitives.SignedBytes;
 import io.airlift.slice.Slice;
 import io.airlift.slice.Slices;
 import io.trino.spi.type.CharType;
 import io.trino.spi.type.DecimalType;
 import io.trino.spi.type.Int128;
+import io.trino.spi.type.LongTimeWithTimeZone;
+import io.trino.spi.type.LongTimestamp;
+import io.trino.spi.type.LongTimestampWithTimeZone;
+import io.trino.spi.type.SqlTime;
+import io.trino.spi.type.SqlTimeWithTimeZone;
+import io.trino.spi.type.SqlTimestamp;
+import io.trino.spi.type.SqlTimestampWithTimeZone;
+import io.trino.spi.type.SqlVarbinary;
 import io.trino.spi.type.TimeType;
 import io.trino.spi.type.TimeWithTimeZoneType;
 import io.trino.spi.type.TimestampType;
 import io.trino.spi.type.TimestampWithTimeZoneType;
 import io.trino.spi.type.Type;
 import io.trino.spi.type.UuidType;
+import io.trino.spi.type.VarbinaryType;
 import io.trino.spi.type.VarcharType;
 import io.trino.sql.tree.IntervalLiteral;
+import io.trino.type.IntervalDayTimeType;
+import io.trino.type.IntervalYearMonthType;
 import io.trino.type.IpAddressType;
 
+import java.math.BigDecimal;
 import java.math.BigInteger;
+import java.math.MathContext;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.time.LocalDate;
 import java.util.Optional;
 import java.util.UUID;
 import java.util.regex.MatchResult;
@@ -39,17 +57,26 @@
 import java.util.regex.Pattern;
 
 import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Verify.verify;
 import static io.airlift.slice.Slices.utf8Slice;
 import static io.airlift.slice.Slices.wrappedBuffer;
 import static io.trino.spi.type.BigintType.BIGINT;
 import static io.trino.spi.type.BooleanType.BOOLEAN;
+import static io.trino.spi.type.DateTimeEncoding.unpackMillisUtc;
+import static io.trino.spi.type.DateTimeEncoding.unpackOffsetMinutes;
+import static io.trino.spi.type.DateTimeEncoding.unpackTimeNanos;
+import static io.trino.spi.type.DateTimeEncoding.unpackZoneKey;
 import static io.trino.spi.type.DateType.DATE;
 import static io.trino.spi.type.DoubleType.DOUBLE;
 import static io.trino.spi.type.IntegerType.INTEGER;
 import static io.trino.spi.type.RealType.REAL;
 import static io.trino.spi.type.SmallintType.SMALLINT;
+import static io.trino.spi.type.TimeZoneKey.getTimeZoneKey;
+import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_SECOND;
+import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_NANOSECOND;
 import static io.trino.spi.type.TinyintType.TINYINT;
 import static io.trino.spi.type.UuidType.javaUuidToTrinoUuid;
+import static io.trino.spi.type.UuidType.trinoUuidToJavaUuid;
 import static io.trino.spi.type.VarbinaryType.VARBINARY;
 import static io.trino.type.DateTimes.parseTime;
 import static io.trino.type.DateTimes.parseTimeWithTimeZone;
@@ -61,6 +88,10 @@
 import static io.trino.util.DateTimeUtils.parseDayTimeInterval;
 import static io.trino.util.DateTimeUtils.parseYearMonthInterval;
 import static java.lang.Float.floatToRawIntBits;
+import static java.lang.Float.intBitsToFloat;
+import static java.lang.Math.floorDiv;
+import static java.lang.Math.floorMod;
+import static java.lang.Math.toIntExact;
 import static java.lang.System.arraycopy;
 import static java.util.Locale.ENGLISH;
 
@@ -247,4 +278,150 @@ else if (address.length == 16) {
         return wrappedBuffer(bytes);
     }
+
+    public static String format(Type type, Object value)
+    {
+        Class<?> javaType = type.getJavaType();
+        if (javaType == boolean.class) {
+            boolean typedValue = (boolean) value;
+            return Boolean.toString(typedValue);
+        }
+
+        if (javaType == double.class) {
+            double typedValue = (double) value;
+            return Double.toString(typedValue);
+        }
+
+        if (javaType == long.class) {
+            long typedValue = (long) value;
+
+            if (type == TINYINT) {
+                return Byte.toString(SignedBytes.checkedCast(typedValue));
+            }
+
+            if (type == REAL) {
+                return Float.toString(intBitsToFloat(toIntExact(typedValue)));
+            }
+
+            if (type == DATE) {
+                return LocalDate.ofEpochDay(typedValue).toString();
+            }
+
+            if (type == BIGINT) {
+                return Long.toString(typedValue);
+            }
+
+            if (type == INTEGER) {
+                return Integer.toString(toIntExact(typedValue));
+            }
+
+            if (type instanceof DecimalType decimalType) {
+                BigInteger unscaledValue = BigInteger.valueOf(typedValue);
+                BigDecimal bigDecimal = new BigDecimal(unscaledValue, decimalType.getScale(), new MathContext(decimalType.getPrecision()));
+                return bigDecimal.toString();
+            }
+
+            if (type == SMALLINT) {
+                return Short.toString(Shorts.checkedCast(typedValue));
+            }
+
+            if (type instanceof TimeType timeType) {
+                return SqlTime.newInstance(timeType.getPrecision(), typedValue).toString();
+            }
+
+            if (type instanceof TimestampType timestampType) {
+                if (timestampType.isShort()) {
+                    return SqlTimestamp.newInstance(timestampType.getPrecision(), typedValue, 0).toString();
+                }
+            }
+
+            if (type instanceof TimeWithTimeZoneType timeWithTimeZoneType) {
+                verify(timeWithTimeZoneType.isShort(), "Short TimeWithTimeZoneType was expected");
+                return SqlTimeWithTimeZone.newInstance(
+                        timeWithTimeZoneType.getPrecision(),
+                        unpackTimeNanos(typedValue) * PICOSECONDS_PER_NANOSECOND,
+                        unpackOffsetMinutes(typedValue)).toString();
+            }
+
+            if (type instanceof TimestampWithTimeZoneType timestampWithTimeZoneType) {
+                verify(timestampWithTimeZoneType.isShort(), "Short TimestampWithTimezone was expected");
+                return SqlTimestampWithTimeZone.newInstance(
+                        timestampWithTimeZoneType.getPrecision(),
+                        unpackMillisUtc(typedValue),
+                        0,
+                        unpackZoneKey(typedValue)).toString();
+            }
+
+            if (type instanceof IntervalDayTimeType) {
+                long epochSeconds = floorDiv(typedValue, (long) MILLISECONDS_PER_SECOND);
+                long fractionalSecond = floorMod(typedValue, (long) MILLISECONDS_PER_SECOND);
+                return "%d.%03d".formatted(epochSeconds, fractionalSecond);
+            }
+
+            if (type instanceof IntervalYearMonthType) {
+                return Long.toString(typedValue);
+            }
+        }
+
+        if (javaType == Slice.class) {
+            Slice typedValue = (Slice) value;
+            switch (type) {
+                case VarcharType _, CharType _ -> {
+                    return typedValue.toStringUtf8();
+                }
+                case VarbinaryType _ -> {
+                    return new SqlVarbinary(typedValue.getBytes()).toString();
+                }
+                case IpAddressType _ -> {
+                    try {
+                        return InetAddresses.toAddrString(InetAddress.getByAddress(typedValue.getBytes()));
+                    }
+                    catch (UnknownHostException e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+                case UuidType _ -> {
+                    return trinoUuidToJavaUuid(typedValue).toString();
+                }
+                default -> throw new UnsupportedOperationException("Unsupported type " + type + " backed by java " + javaType);
+            }
+        }
+
+        if (javaType == Int128.class) {
+            if (type instanceof DecimalType decimalType) {
+                Int128 typedValue = (Int128) value;
+
+                BigInteger unscaledValue = typedValue.toBigInteger();
+                BigDecimal bigDecimal = new BigDecimal(unscaledValue, decimalType.getScale(), new MathContext(decimalType.getPrecision()));
+                return bigDecimal.toPlainString();
+            }
+        }
+
+        switch (type) {
+            case TimeWithTimeZoneType timeWithTimeZoneType when javaType == LongTimeWithTimeZone.class -> {
+                LongTimeWithTimeZone typedValue = (LongTimeWithTimeZone) value;
+                return SqlTimeWithTimeZone.newInstance(
+                        timeWithTimeZoneType.getPrecision(),
+                        typedValue.getPicoseconds(),
+                        typedValue.getOffsetMinutes()).toString();
+            }
+            case TimestampType timestampType when javaType == LongTimestamp.class -> {
+                LongTimestamp typedValue = (LongTimestamp) value;
+
+                return SqlTimestamp.newInstance(
+                        timestampType.getPrecision(),
+                        typedValue.getEpochMicros(),
+                        typedValue.getPicosOfMicro()).toString();
+            }
+            case TimestampWithTimeZoneType timestampWithTimeZoneType when javaType == LongTimestampWithTimeZone.class -> {
+                LongTimestampWithTimeZone typedValue = (LongTimestampWithTimeZone) value;
+                return SqlTimestampWithTimeZone.newInstance(
+                        timestampWithTimeZoneType.getPrecision(),
+                        typedValue.getEpochMillis(),
+                        typedValue.getPicosOfMilli(),
+                        getTimeZoneKey(typedValue.getTimeZoneKey())).toString();
+            }
+            default -> throw new UnsupportedOperationException("Unsupported type " + type + " backed by java " + javaType);
+        }
+    }
 }
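Literal.format is the write-side counterpart of the parsing methods earlier in this class: it renders a native in-engine value back into the literal string stored in the min, max, and allowed_values column properties, so a round trip through parsing and formatting is expected to be lossless. As a worked reading of the interval branch (an illustration, not output from a test): an INTERVAL DAY TO SECOND carried as 1500 epoch milliseconds is split with floorDiv and floorMod by MILLISECONDS_PER_SECOND and prints as 1.500.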
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/SchemaInfo.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/SchemaInfo.java
index 451823a6f4c1..e15618c0a03a 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/SchemaInfo.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/SchemaInfo.java
@@ -23,6 +23,8 @@ public record SchemaInfo(String name, Map<String, Object> properties)
 {
     public static final String NULL_PROBABILITY_PROPERTY = "null_probability";
     public static final String DEFAULT_LIMIT_PROPERTY = "default_limit";
+    public static final String SEQUENCE_DETECTION_ENABLED = "sequence_detection_enabled";
+    public static final String DICTIONARY_DETECTION_ENABLED = "dictionary_detection_enabled";
 
     public SchemaInfo
     {
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/TableInfo.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/TableInfo.java
index e2a061a91394..0012f71f538b 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/TableInfo.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/TableInfo.java
@@ -27,6 +27,8 @@ public record TableInfo(List<ColumnInfo> columns, Map<String, Object> properties
 {
     public static final String NULL_PROBABILITY_PROPERTY = "null_probability";
     public static final String DEFAULT_LIMIT_PROPERTY = "default_limit";
+    public static final String SEQUENCE_DETECTION_ENABLED = "sequence_detection_enabled";
+    public static final String DICTIONARY_DETECTION_ENABLED = "dictionary_detection_enabled";
 
     public TableInfo
     {
diff --git a/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerConfig.java b/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerConfig.java
index 14c8d4f54f69..49bc648f9d56 100644
--- a/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerConfig.java
+++ b/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerConfig.java
@@ -30,7 +30,9 @@ void testDefaults()
         assertRecordedDefaults(recordDefaults(FakerConfig.class)
                 .setNullProbability(0.5)
                 .setDefaultLimit(1000L)
-                .setLocale("en"));
+                .setLocale("en")
+                .setSequenceDetectionEnabled(true)
+                .setDictionaryDetectionEnabled(true));
     }
 
     @Test
@@ -40,12 +42,16 @@ void testExplicitPropertyMappings()
                 .put("faker.null-probability", "1.0")
                 .put("faker.default-limit", "10")
                 .put("faker.locale", "pl-PL")
+                .put("faker.sequence-detection-enabled", "false")
+                .put("faker.dictionary-detection-enabled", "false")
                 .buildOrThrow();
 
         FakerConfig expected = new FakerConfig()
                 .setNullProbability(1.0)
                 .setDefaultLimit(10L)
-                .setLocale("pl-PL");
+                .setLocale("pl-PL")
+                .setSequenceDetectionEnabled(false)
+                .setDictionaryDetectionEnabled(false);
 
         assertFullMapping(properties, expected);
     }
diff --git a/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerQueries.java b/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerQueries.java
index 97f3ecf72f2a..643c04674b06 100644
--- a/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerQueries.java
+++ b/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerQueries.java
@@ -495,4 +495,101 @@ void testRenameTableAcrossSchema()
         assertUpdate("DROP TABLE new_schema.renamed_table");
         assertUpdate("DROP SCHEMA new_schema");
     }
+
+    @Test
+    void testCreateTableAsSelect()
+    {
+        assertUpdate("CREATE TABLE faker.default.limited_range WITH (null_probability = 0, default_limit = 50, dictionary_detection_enabled = false) AS " +
+                "SELECT * FROM (VALUES -1, 3, 5) t(id)", 3);
+
+        assertQuery("SELECT count(id) FROM (SELECT id FROM limited_range) a",
+                "VALUES (50)");
+
+        assertQueryFails("INSERT INTO faker.default.limited_range(id) VALUES (10)", "This connector does not support inserts");
+
+        assertUpdate("DROP TABLE faker.default.limited_range");
+
+        List<TestDataType> testCases = ImmutableList.<TestDataType>builder()
+                .add(new TestDataType("rnd_bigint", "bigint", Map.of("min", "0", "max", "1"), "count(distinct rnd_bigint)", "2"))
+                .add(new TestDataType("rnd_integer", "integer", Map.of("min", "0", "max", "1"), "count(distinct rnd_integer)", "2"))
+                .add(new TestDataType("rnd_smallint", "smallint", Map.of("min", "0", "max", "1"), "count(distinct rnd_smallint)", "2"))
+                .add(new TestDataType("rnd_tinyint", "tinyint", Map.of("min", "0", "max", "1"), "count(distinct rnd_tinyint)", "2"))
+                .add(new TestDataType("rnd_date", "date", Map.of("min", "2022-03-01", "max", "2022-03-02"), "count(distinct rnd_date)", "2"))
+                .add(new TestDataType("rnd_decimal1", "decimal", Map.of("min", "0", "max", "1"), "count(distinct rnd_decimal1)", "2"))
+                .add(new TestDataType("rnd_decimal2", "decimal(18,5)", Map.of("min", "0.00000", "max", "0.00001"), "count(distinct rnd_decimal2)", "2"))
+                .add(new TestDataType("rnd_decimal3", "decimal(38,0)", Map.of("min", "0", "max", "1"), "count(distinct rnd_decimal3)", "2"))
+                .add(new TestDataType("rnd_decimal4", "decimal(38,38)", Map.of("min", "0.00000000000000000000000000000000000000", "max", "0.00000000000000000000000000000000000001"), "count(distinct rnd_decimal4)", "2"))
+                .add(new TestDataType("rnd_decimal5", "decimal(5,2)", Map.of("min", "0.00", "max", "0.01"), "count(distinct rnd_decimal5)", "2"))
+                .add(new TestDataType("rnd_real", "real", Map.of("min", "0.0", "max", "1.4E-45"), "count(distinct rnd_real)", "2"))
+                .add(new TestDataType("rnd_double", "double", Map.of("min", "0.0", "max", "4.9E-324"), "count(distinct rnd_double)", "2"))
+                .add(new TestDataType("rnd_interval1", "interval day to second", Map.of("min", "0.000", "max", "0.001"), "count(distinct rnd_interval1)", "2"))
+                .add(new TestDataType("rnd_interval2", "interval year to month", Map.of("min", "0", "max", "1"), "count(distinct rnd_interval2)", "2"))
+                .add(new TestDataType("rnd_timestamp", "timestamp", Map.of("min", "2022-03-21 00:00:00.000", "max", "2022-03-21 00:00:00.001"), "count(distinct rnd_timestamp)", "2"))
+                .add(new TestDataType("rnd_timestamp0", "timestamp(0)", Map.of("min", "2022-03-21 00:00:00", "max", "2022-03-21 00:00:01"), "count(distinct rnd_timestamp0)", "2"))
+                .add(new TestDataType("rnd_timestamp6", "timestamp(6)", Map.of("min", "2022-03-21 00:00:00.000000", "max", "2022-03-21 00:00:00.000001"), "count(distinct rnd_timestamp6)", "2"))
+                .add(new TestDataType("rnd_timestamp9", "timestamp(9)", Map.of("min", "2022-03-21 00:00:00.000000000", "max", "2022-03-21 00:00:00.000000001"), "count(distinct rnd_timestamp9)", "2"))
+                .add(new TestDataType("rnd_timestamptz", "timestamp with time zone", Map.of("min", "2022-03-21 00:00:00.000 +01:00", "max", "2022-03-21 00:00:00.001 +01:00"), "count(distinct rnd_timestamptz)", "2"))
+                .add(new TestDataType("rnd_timestamptz0", "timestamp(0) with time zone", Map.of("min", "2022-03-21 00:00:00 +01:00", "max", "2022-03-21 00:00:01 +01:00"), "count(distinct rnd_timestamptz0)", "2"))
+                .add(new TestDataType("rnd_timestamptz6", "timestamp(6) with time zone", Map.of("min", "2022-03-21 00:00:00.000000 +01:00", "max", "2022-03-21 00:00:00.000001 +01:00"), "count(distinct rnd_timestamptz6)", "2"))
+                .add(new TestDataType("rnd_timestamptz9", "timestamp(9) with time zone", Map.of("min", "2022-03-21 00:00:00.000000000 +01:00", "max", "2022-03-21 00:00:00.000000001 +01:00"), "count(distinct rnd_timestamptz9)", "2"))
+                .add(new TestDataType("rnd_time", "time", Map.of("min", "01:02:03.456", "max", "01:02:03.457"), "count(distinct rnd_time)", "2"))
+                .add(new TestDataType("rnd_time0", "time(0)", Map.of("min", "01:02:03", "max", "01:02:04"), "count(distinct rnd_time0)", "2"))
+                .add(new TestDataType("rnd_time6", "time(6)", Map.of("min", "01:02:03.000456", "max", "01:02:03.000457"), "count(distinct rnd_time6)", "2"))
+                .add(new TestDataType("rnd_time9", "time(9)", Map.of("min", "01:02:03.000000456", "max", "01:02:03.000000457"), "count(distinct rnd_time9)", "2"))
+                .add(new TestDataType("rnd_timetz", "time with time zone", Map.of("min", "01:02:03.456 +01:00", "max", "01:02:03.457 +01:00"), "count(distinct rnd_timetz)", "2"))
+                .add(new TestDataType("rnd_timetz0", "time(0) with time zone", Map.of("min", "01:02:03 +01:00", "max", "01:02:04 +01:00"), "count(distinct rnd_timetz0)", "2"))
+                .add(new TestDataType("rnd_timetz6", "time(6) with time zone", Map.of("min", "01:02:03.000456 +01:00", "max", "01:02:03.000457 +01:00"), "count(distinct rnd_timetz6)", "2"))
+                .add(new TestDataType("rnd_timetz9", "time(9) with time zone", Map.of("min", "01:02:03.000000456 +01:00", "max", "01:02:03.000000457 +01:00"), "count(distinct rnd_timetz9)", "2"))
+                .add(new TestDataType("rnd_timetz12", "time(12) with time zone", Map.of("min", "01:02:03.000000000456 +01:00", "max", "01:02:03.000000000457 +01:00"), "count(distinct rnd_timetz12)", "2"))
+                .build();
+
+        for (TestDataType testCase : testCases) {
+            try (TestTable sourceTable = new TestTable(getQueryRunner()::execute, "ctas_src_" + testCase.name(), "(%s) WITH (null_probability = 0, default_limit = 1000)".formatted(testCase.columnSchema()));
+                    TestTable table = new TestTable(getQueryRunner()::execute, "ctas_" + testCase.name(), "WITH (null_probability = 0, default_limit = 1000, dictionary_detection_enabled = false, sequence_detection_enabled = false) AS SELECT %s FROM %s".formatted(testCase.name(), sourceTable.getName()))) {
+                assertQuery("SELECT %s FROM %s".formatted(testCase.queryExpression(), table.getName()), "VALUES (%s)".formatted(testCase.expectedValue()));
+            }
+        }
+
+        for (TestDataType testCase : testCases) {
+            try (TestTable sourceTable = new TestTable(getQueryRunner()::execute, "ctas_src_" + testCase.name(), "(%s %s) WITH (null_probability = 0, default_limit = 2)".formatted(testCase.name(), testCase.type()));
+                    TestTable table = new TestTable(getQueryRunner()::execute, "ctas_" + testCase.name(), "WITH (null_probability = 0, default_limit = 1000, sequence_detection_enabled = false) AS SELECT %s FROM %s".formatted(testCase.name(), sourceTable.getName()))) {
+                assertQuery("SELECT %s FROM %s".formatted(testCase.queryExpression(), table.getName()), "VALUES (%s)".formatted(testCase.expectedValue()));
+            }
+        }
+    }
+
+    @Test
+    void testCreateTableAsSelectSequence()
+    {
+        String source = """
+                SELECT
+                    cast(greatest(least(sequential_number, 0x7f), -0x80) AS TINYINT) AS seq_tinyint,
+                    cast(sequential_number AS SMALLINT) AS seq_smallint,
+                    cast(sequential_number AS INTEGER) AS seq_integer,
+                    cast(sequential_number AS BIGINT) AS seq_bigint
+                FROM TABLE(sequence(start => -500, stop => 500, step => 1))
+                """;
+        try (TestTable sourceTable = new TestTable(getQueryRunner()::execute, "seq_src", "WITH (null_probability = 0, default_limit = 1000, dictionary_detection_enabled = false) AS " + source);
+                TestTable table = new TestTable(getQueryRunner()::execute, "seq", "WITH (null_probability = 0, default_limit = 1000, dictionary_detection_enabled = false) AS SELECT * FROM %s".formatted(sourceTable.getName()))) {
+            String createTable = (String) computeScalar("SHOW CREATE TABLE " + table.getName());
+            assertThat(createTable).containsPattern("seq_tinyint tinyint WITH \\(max = '\\d+', min = '-\\d+'\\)");
+            assertThat(createTable).containsPattern("seq_smallint smallint WITH \\(max = '\\d+', min = '-\\d+', step = '1'\\)");
+            assertThat(createTable).containsPattern("seq_integer integer WITH \\(max = '\\d+', min = '-\\d+', step = '1'\\)");
+            assertThat(createTable).containsPattern("seq_bigint bigint WITH \\(max = '\\d+', min = '-\\d+', step = '1'\\)");
+        }
+    }
+
+    @Test
+    void testCreateTableAsSelectNulls()
+    {
+        String source = """
+                SELECT
+                    cast(NULL AS INTEGER) AS nullable
+                FROM TABLE(sequence(start => 0, stop => 1000, step => 1))
+                """;
+        try (TestTable table = new TestTable(getQueryRunner()::execute, "only_nulls", "WITH (dictionary_detection_enabled = false) AS " + source)) {
+            String createTable = (String) computeScalar("SHOW CREATE TABLE " + table.getName());
+            assertThat(createTable).containsPattern("nullable integer WITH \\(null_probability = 1E0\\)");
+        }
+    }
 }
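A closing note on what dictionary detection adds when left enabled (the tests above turn it off so their assertions stay deterministic): a source column with at most MAX_DICTIONARY_SIZE = 1000 distinct values is described by an allowed_values property instead of a plain min/max range, and the values are generated by FakerPageSource within the observed range rather than copied from the source. For the VALUES (-1, 3, 5) source in testCreateTableAsSelect, the result would look along these lines (the entries are random; only the shape is stable):

    SHOW CREATE TABLE faker.default.limited_range;
    -- id integer WITH (allowed_values = ARRAY['-1', '2', '4'])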