-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Update MetricsConfig to use a default for first 32 columns #5215
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,13 +29,17 @@ | |
| import org.apache.iceberg.relocated.com.google.common.base.Joiner; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Maps; | ||
| import org.apache.iceberg.types.TypeUtil; | ||
| import org.apache.iceberg.util.PropertyUtil; | ||
| import org.apache.iceberg.util.SerializableMap; | ||
| import org.apache.iceberg.util.SortOrderUtil; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE; | ||
| import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT; | ||
| import static org.apache.iceberg.TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS; | ||
| import static org.apache.iceberg.TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT; | ||
| import static org.apache.iceberg.TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX; | ||
|
|
||
| @Immutable | ||
|
|
@@ -45,9 +49,8 @@ public final class MetricsConfig implements Serializable { | |
| private static final Joiner DOT = Joiner.on('.'); | ||
|
|
||
| // Disable metrics by default for wide tables to prevent excessive metadata | ||
| private static final int MAX_COLUMNS = 32; | ||
| private static final MetricsConfig DEFAULT = new MetricsConfig(ImmutableMap.of(), | ||
| MetricsModes.fromString(DEFAULT_WRITE_METRICS_MODE_DEFAULT)); | ||
| private static final MetricsMode DEFAULT_MODE = MetricsModes.fromString(DEFAULT_WRITE_METRICS_MODE_DEFAULT); | ||
| private static final MetricsConfig DEFAULT = new MetricsConfig(ImmutableMap.of(), DEFAULT_MODE); | ||
|
|
||
| private final Map<String, MetricsMode> columnModes; | ||
| private final MetricsMode defaultMode; | ||
|
|
@@ -96,21 +99,15 @@ static Map<String, String> updateProperties(Map<String, String> props, List<Stri | |
| **/ | ||
| @Deprecated | ||
| public static MetricsConfig fromProperties(Map<String, String> props) { | ||
| return from(props, null, DEFAULT_WRITE_METRICS_MODE_DEFAULT); | ||
| return from(props, null, null); | ||
| } | ||
|
|
||
| /** | ||
| * Creates a metrics config from a table. | ||
| * @param table iceberg table | ||
| */ | ||
| public static MetricsConfig forTable(Table table) { | ||
| String defaultMode; | ||
| if (table.schema().columns().size() <= MAX_COLUMNS) { | ||
| defaultMode = DEFAULT_WRITE_METRICS_MODE_DEFAULT; | ||
| } else { | ||
| defaultMode = MetricsModes.None.get().toString(); | ||
| } | ||
| return from(table.properties(), table.sortOrder(), defaultMode); | ||
| return from(table.properties(), table.schema(), table.sortOrder()); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -136,50 +133,55 @@ public static MetricsConfig forPositionDelete(Table table) { | |
| } | ||
|
|
||
| /** | ||
| * Generate a MetricsConfig for all columns based on overrides, sortOrder, and defaultMode. | ||
| * Generate a MetricsConfig for all columns based on overrides, schema, and sort order. | ||
| * | ||
| * @param props will be read for metrics overrides (write.metadata.metrics.column.*) and default | ||
| * (write.metadata.metrics.default) | ||
| * @param schema table schema | ||
| * @param order sort order columns, will be promoted to truncate(16) | ||
| * @param defaultMode default, if not set by user property | ||
| * @return metrics configuration | ||
| */ | ||
| private static MetricsConfig from(Map<String, String> props, SortOrder order, String defaultMode) { | ||
| private static MetricsConfig from(Map<String, String> props, Schema schema, SortOrder order) { | ||
| int maxInferredDefaultColumns = maxInferredColumnDefaults(props); | ||
| Map<String, MetricsMode> columnModes = Maps.newHashMap(); | ||
|
|
||
| // Handle user override of default mode | ||
| MetricsMode finalDefaultMode; | ||
| String defaultModeAsString = props.getOrDefault(DEFAULT_WRITE_METRICS_MODE, defaultMode); | ||
| try { | ||
| finalDefaultMode = MetricsModes.fromString(defaultModeAsString); | ||
| } catch (IllegalArgumentException err) { | ||
| // User override was invalid, log the error and use the default | ||
| LOG.warn("Ignoring invalid default metrics mode: {}", defaultModeAsString, err); | ||
| finalDefaultMode = MetricsModes.fromString(defaultMode); | ||
| MetricsMode defaultMode; | ||
| String configuredDefault = props.get(DEFAULT_WRITE_METRICS_MODE); | ||
| if (configuredDefault != null) { | ||
| // a user-configured default mode is applied for all columns | ||
| defaultMode = parseMode(configuredDefault, DEFAULT_MODE, "default"); | ||
|
|
||
| } else if (schema == null || schema.columns().size() <= maxInferredDefaultColumns) { | ||
| // there are less than the inferred limit, so the default is used everywhere | ||
| defaultMode = DEFAULT_MODE; | ||
|
|
||
| } else { | ||
| // an inferred default mode is applied to the first few columns, up to the limit | ||
| Schema subSchema = new Schema(schema.columns().subList(0, maxInferredDefaultColumns)); | ||
danielcweeks marked this conversation as resolved.
Show resolved
Hide resolved
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Quick warning:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a temporary schema so it should be fine.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if I have a highly nested schema? The number of stored metrics can be way more than 32 in that case?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but this is the current behavior. We use the top-level columns for the current check. |
||
| for (Integer id : TypeUtil.getProjectedIds(subSchema)) { | ||
| columnModes.put(subSchema.findColumnName(id), DEFAULT_MODE); | ||
| } | ||
|
|
||
| // all other columns don't use metrics | ||
| defaultMode = MetricsModes.None.get(); | ||
| } | ||
|
|
||
| // First set sorted column with sorted column default (can be overridden by user) | ||
| MetricsMode sortedColDefaultMode = sortedColumnDefaultMode(finalDefaultMode); | ||
| MetricsMode sortedColDefaultMode = sortedColumnDefaultMode(defaultMode); | ||
| Set<String> sortedCols = SortOrderUtil.orderPreservingSortedColumns(order); | ||
| sortedCols.forEach(sc -> columnModes.put(sc, sortedColDefaultMode)); | ||
|
|
||
| // Handle user overrides of defaults | ||
| MetricsMode finalDefaultModeVal = finalDefaultMode; | ||
| props.keySet().stream() | ||
| .filter(key -> key.startsWith(METRICS_MODE_COLUMN_CONF_PREFIX)) | ||
| .forEach(key -> { | ||
| String columnAlias = key.replaceFirst(METRICS_MODE_COLUMN_CONF_PREFIX, ""); | ||
| MetricsMode mode; | ||
| try { | ||
| mode = MetricsModes.fromString(props.get(key)); | ||
| } catch (IllegalArgumentException err) { | ||
| // Mode was invalid, log the error and use the default | ||
| LOG.warn("Ignoring invalid metrics mode for column {}: {}", columnAlias, props.get(key), err); | ||
| mode = finalDefaultModeVal; | ||
| } | ||
| columnModes.put(columnAlias, mode); | ||
| }); | ||
| for (String key : props.keySet()) { | ||
| if (key.startsWith(METRICS_MODE_COLUMN_CONF_PREFIX)) { | ||
| String columnAlias = key.replaceFirst(METRICS_MODE_COLUMN_CONF_PREFIX, ""); | ||
| MetricsMode mode = parseMode(props.get(key), defaultMode, "column " + columnAlias); | ||
| columnModes.put(columnAlias, mode); | ||
| } | ||
| } | ||
|
|
||
| return new MetricsConfig(columnModes, finalDefaultMode); | ||
| return new MetricsConfig(columnModes, defaultMode); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -195,6 +197,29 @@ private static MetricsMode sortedColumnDefaultMode(MetricsMode defaultMode) { | |
| } | ||
| } | ||
|
|
||
| private static int maxInferredColumnDefaults(Map<String, String> properties) { | ||
| int maxInferredDefaultColumns = PropertyUtil.propertyAsInt(properties, | ||
| METRICS_MAX_INFERRED_COLUMN_DEFAULTS, METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT); | ||
| if (maxInferredDefaultColumns < 0) { | ||
| LOG.warn("Invalid value for {} (negative): {}, falling back to {}", | ||
| METRICS_MAX_INFERRED_COLUMN_DEFAULTS, maxInferredDefaultColumns, | ||
| METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT); | ||
| return METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT; | ||
| } else { | ||
| return maxInferredDefaultColumns; | ||
| } | ||
| } | ||
|
|
||
| private static MetricsMode parseMode(String modeString, MetricsMode fallback, String context) { | ||
| try { | ||
| return MetricsModes.fromString(modeString); | ||
| } catch (IllegalArgumentException err) { | ||
| // User override was invalid, log the error and use the default | ||
| LOG.warn("Ignoring invalid metrics mode ({}): {}", context, modeString, err); | ||
szehon-ho marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return fallback; | ||
| } | ||
| } | ||
|
|
||
| public void validateReferencedColumns(Schema schema) { | ||
| for (String column : columnModes.keySet()) { | ||
| ValidationException.check( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -267,6 +267,10 @@ private TableProperties() { | |
| public static final String METADATA_DELETE_AFTER_COMMIT_ENABLED = "write.metadata.delete-after-commit.enabled"; | ||
| public static final boolean METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT = false; | ||
|
|
||
| public static final String METRICS_MAX_INFERRED_COLUMN_DEFAULTS = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I debated what to call this. The problem is that we are defaulting the default. You can explicitly set a default using
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree with this table property, initially I had made one but it was taken out during the discussions. Indeed it's a bit of a confusing config, but I dont see any other great option.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The naming felt a little bit confusing to me too. After I read the explanation, it started to make more sense. However, I am still not sure
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem with "persist" and similar is that it misses the distinction between an explicit default (when What about changing this to include
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's just keep it as-is then. I don't think it is a big deal. |
||
| "write.metadata.metrics.max-inferred-column-defaults"; | ||
| public static final int METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT = 32; | ||
|
|
||
| public static final String METRICS_MODE_COLUMN_CONF_PREFIX = "write.metadata.metrics.column."; | ||
| public static final String DEFAULT_WRITE_METRICS_MODE = "write.metadata.metrics.default"; | ||
| public static final String DEFAULT_WRITE_METRICS_MODE_DEFAULT = "truncate(16)"; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@danielcweeks Is this a left-over comment?