-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Consolidate write.folder-storage.path and write.object-storage.path to write.data.path #3094
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
b12ec79
abcf950
85c9f66
45a79a0
e4517b7
9579578
5402407
0c3209b
6018496
df70964
884cbf8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,10 +28,11 @@ | |
| import org.apache.iceberg.transforms.Transforms; | ||
| import org.apache.iceberg.types.Types; | ||
| import org.apache.iceberg.util.PropertyUtil; | ||
|
|
||
| import static org.apache.iceberg.TableProperties.OBJECT_STORE_PATH; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| public class LocationProviders { | ||
| private static final Logger LOG = LoggerFactory.getLogger(LocationProviders.class); | ||
|
|
||
| private LocationProviders() { | ||
| } | ||
|
|
@@ -68,16 +69,12 @@ public static LocationProvider locationsFor(String location, Map<String, String> | |
| } | ||
| } | ||
|
|
||
| private static String defaultDataLocation(String tableLocation, Map<String, String> properties) { | ||
| return properties.getOrDefault(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, | ||
| String.format("%s/data", tableLocation)); | ||
| } | ||
|
|
||
| static class DefaultLocationProvider implements LocationProvider { | ||
| private final String dataLocation; | ||
|
|
||
| DefaultLocationProvider(String tableLocation, Map<String, String> properties) { | ||
| this.dataLocation = stripTrailingSlash(defaultDataLocation(tableLocation, properties)); | ||
| this.dataLocation = | ||
| stripTrailingSlash(dataLocation(properties, tableLocation, false)); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -99,8 +96,8 @@ static class ObjectStoreLocationProvider implements LocationProvider { | |
| private final String context; | ||
|
|
||
| ObjectStoreLocationProvider(String tableLocation, Map<String, String> properties) { | ||
| this.storageLocation = stripTrailingSlash(properties.getOrDefault(OBJECT_STORE_PATH, | ||
| defaultDataLocation(tableLocation, properties))); | ||
| this.storageLocation = | ||
| stripTrailingSlash(dataLocation(properties, tableLocation, true)); | ||
flyrain marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| this.context = pathContext(tableLocation); | ||
| } | ||
|
|
||
|
|
@@ -141,4 +138,34 @@ private static String stripTrailingSlash(String path) { | |
| } | ||
| return result; | ||
| } | ||
|
|
||
| private static String dataLocation(Map<String, String> properties, String tableLocation, boolean isObjectStore) { | ||
|
||
| String dataLocation = properties.get(TableProperties.WRITE_DATA_LOCATION); | ||
| if (dataLocation == null) { | ||
| String deprecatedProperty = null; | ||
|
||
| if (isObjectStore) { | ||
| deprecatedProperty = TableProperties.OBJECT_STORE_PATH; | ||
| } | ||
|
|
||
| dataLocation = properties.get(deprecatedProperty); | ||
|
||
| if (dataLocation == null) { | ||
| dataLocation = properties.get(TableProperties.WRITE_FOLDER_STORAGE_LOCATION); | ||
| if (dataLocation == null) { | ||
| dataLocation = String.format("%s/data", tableLocation); | ||
| } else { | ||
| logWarnForDeprecatedProperty(TableProperties.WRITE_FOLDER_STORAGE_LOCATION); | ||
| } | ||
| } else { | ||
| logWarnForDeprecatedProperty(deprecatedProperty); | ||
| } | ||
| } | ||
| return dataLocation; | ||
| } | ||
|
|
||
| private static void logWarnForDeprecatedProperty(String deprecatedProperty) { | ||
| if (deprecatedProperty != null) { | ||
| LOG.warn("Table property {} is deprecated, please use {} instead.", deprecatedProperty, | ||
| TableProperties.WRITE_DATA_LOCATION); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,21 +135,31 @@ private TableProperties() { | |
| public static final String OBJECT_STORE_ENABLED = "write.object-storage.enabled"; | ||
| public static final boolean OBJECT_STORE_ENABLED_DEFAULT = false; | ||
|
|
||
| /** | ||
| * @deprecated Use {@link #WRITE_DATA_LOCATION} instead. | ||
| */ | ||
| @Deprecated | ||
| public static final String OBJECT_STORE_PATH = "write.object-storage.path"; | ||
|
|
||
| public static final String WRITE_LOCATION_PROVIDER_IMPL = "write.location-provider.impl"; | ||
|
|
||
| // This only applies to files written after this property is set. Files previously written aren't | ||
| // relocated to reflect this parameter. | ||
| // If not set, defaults to a "data" folder underneath the root path of the table. | ||
| /** | ||
| * @deprecated Use {@link #WRITE_DATA_LOCATION} instead. | ||
| */ | ||
| @Deprecated | ||
| public static final String WRITE_FOLDER_STORAGE_LOCATION = "write.folder-storage.path"; | ||
|
|
||
| /** | ||
| * @deprecated will be removed in 0.14.0, use {@link #WRITE_FOLDER_STORAGE_LOCATION} instead | ||
| * @deprecated will be removed in 0.14.0, use {@link #WRITE_DATA_LOCATION} instead | ||
| */ | ||
| @Deprecated | ||
| public static final String WRITE_NEW_DATA_LOCATION = "write.folder-storage.path"; | ||
|
|
||
| // This only applies to files written after this property is set. Files previously written aren't | ||
| // relocated to reflect this parameter. | ||
| // If not set, defaults to a "data" folder underneath the root path of the table. | ||
| public static final String WRITE_DATA_LOCATION = "write.data.path"; | ||
|
Comment on lines
+158
to
+161
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Do we want to update the comment for the default if not set to reflect anything about the possibility of object storage location provider? Up to you. The more I think about it, the more I think it just complicates things and that we should just properly document the behavior on the website. But up to you.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: we can move L161 to after L144 to avoid deleting the comments and add again here. |
||
|
|
||
| // This only applies to files written after this property is set. Files previously written aren't | ||
| // relocated to reflect this parameter. | ||
| // If not set, defaults to a "metadata" folder underneath the root path of the table. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -344,7 +344,7 @@ Data stored in S3 with a traditional Hive storage layout can face S3 request thr | |
|
|
||
| Iceberg by default uses the Hive storage layout, but can be switched to use the `ObjectStoreLocationProvider`. | ||
| With `ObjectStoreLocationProvider`, a determenistic hash is generated for each stored file, with the hash appended | ||
| directly after the `write.object-storage.path`. This ensures files written to s3 are equally distributed across multiple [prefixes](https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/) in the S3 bucket. Resulting in minimized throttling and maximized throughput for S3-related IO operations. When using `ObjectStoreLocationProvider` having a shared and short `write.object-storage.path` across your Iceberg tables will improve performance. | ||
| directly after the `write.data.path`. This ensures files written to s3 are equally distributed across multiple [prefixes](https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/) in the S3 bucket. Resulting in minimized throttling and maximized throughput for S3-related IO operations. When using `ObjectStoreLocationProvider` having a shared and short `write.data.path` across your Iceberg tables will improve performance. | ||
|
|
||
| For more information on how S3 scales API QPS, checkout the 2018 re:Invent session on [Best Practices for Amazon S3 and Amazon S3 Glacier]( https://youtu.be/rHeTn9pHNKo?t=3219). At [53:39](https://youtu.be/rHeTn9pHNKo?t=3219) it covers how S3 scales/partitions & at [54:50](https://youtu.be/rHeTn9pHNKo?t=3290) it discusses the 30-60 minute wait time before new partitions are created. | ||
|
|
||
|
|
@@ -358,7 +358,7 @@ CREATE TABLE my_catalog.my_ns.my_table ( | |
| USING iceberg | ||
| OPTIONS ( | ||
| 'write.object-storage.enabled'=true, | ||
| 'write.object-storage.path'='s3://my-table-data-bucket') | ||
| 'write.data.path'='s3://my-table-data-bucket') | ||
| PARTITIONED BY (category); | ||
| ``` | ||
|
|
||
|
|
@@ -373,8 +373,7 @@ s3://my-table-data-bucket/2d3905f8/my_ns.db/my_table/category=orders/00000-0-5af | |
| ``` | ||
|
|
||
| Note, the path resolution logic for `ObjectStoreLocationProvider` is as follows: | ||
| - if `write.object-storage.path` is set, use it | ||
| - if not found, fallback to `write.folder-storage.path` | ||
| - if `write.data.path` is set, use it | ||
| - if not found, use `<tableLocation>/data` | ||
|
||
|
|
||
| For more details, please refer to the [LocationProvider Configuration](../custom-catalog/#custom-location-provider-implementation) section. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -168,6 +168,8 @@ protected Map<String, String> destTableProps() { | |
| properties.remove(LOCATION); | ||
| properties.remove(TableProperties.WRITE_METADATA_LOCATION); | ||
| properties.remove(TableProperties.WRITE_FOLDER_STORAGE_LOCATION); | ||
| properties.remove(TableProperties.OBJECT_STORE_PATH); | ||
| properties.remove(TableProperties.WRITE_DATA_LOCATION); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When snapshotting a table that has folder storage location set, should we also remove it in addition to write data location?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It'd be safer to remove both. Added in the new commit.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have a PR #2966 for this change, you can either also port my tests here, or remove this and I will update that PR once this is merged.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @jackye1995 , thanks for the information. I've added both
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, I can do that |
||
|
|
||
| // set default and user-provided props | ||
| properties.put(TableCatalog.PROP_PROVIDER, "iceberg"); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.