-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Core: add row key to format v2 #2354
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,178 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg; | ||
|
|
||
| import java.io.Serializable; | ||
| import java.util.Set; | ||
| import org.apache.iceberg.exceptions.ValidationException; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Sets; | ||
| import org.apache.iceberg.types.Types; | ||
|
|
||
| /** | ||
| * Row key of a table. | ||
| * <p> | ||
| * Row key is a definition of table row uniqueness, | ||
| * similar to the concept of primary key in a relational database system. | ||
| * A row should be unique in a table based on the values of an unordered set of {@link RowKeyIdentifierField}. | ||
| * Iceberg itself does not enforce row uniqueness based on this key. | ||
| * It is leveraged by operations such as streaming upsert. | ||
| */ | ||
| public class RowKey implements Serializable { | ||
|
|
||
| private static final RowKey NOT_IDENTIFIED = new RowKey(new Schema(), Sets.newHashSet()); | ||
|
|
||
| private final Schema schema; | ||
| private final RowKeyIdentifierField[] identifierFields; | ||
|
|
||
| private transient volatile Set<RowKeyIdentifierField> identifierFieldSet; | ||
|
|
||
| private RowKey(Schema schema, Set<RowKeyIdentifierField> identifierFields) { | ||
| this.schema = schema; | ||
| this.identifierFields = identifierFields.toArray(new RowKeyIdentifierField[0]); | ||
| } | ||
|
|
||
| /** | ||
| * Returns the {@link Schema} referenced by the row key | ||
| */ | ||
| public Schema schema() { | ||
| return schema; | ||
| } | ||
|
|
||
| /** | ||
| * Return the set of {@link RowKeyIdentifierField} in the row key | ||
| * <p> | ||
| * @return the set of fields in the row key | ||
| */ | ||
| public Set<RowKeyIdentifierField> identifierFields() { | ||
| return lazyIdentifierFieldSet(); | ||
| } | ||
|
|
||
| private Set<RowKeyIdentifierField> lazyIdentifierFieldSet() { | ||
| if (identifierFieldSet == null) { | ||
| synchronized (this) { | ||
| if (identifierFieldSet == null) { | ||
| identifierFieldSet = ImmutableSet.copyOf(identifierFields); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return identifierFieldSet; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the default row key that has no field | ||
| */ | ||
| public static RowKey notIdentified() { | ||
| return NOT_IDENTIFIED; | ||
| } | ||
|
|
||
| /** | ||
| * Returns true if the row key is the default one with no field | ||
| */ | ||
| public boolean isNotIdentified() { | ||
| return identifierFields.length < 1; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object other) { | ||
| if (this == other) { | ||
| return true; | ||
| } else if (other == null || getClass() != other.getClass()) { | ||
| return false; | ||
| } | ||
|
|
||
| RowKey that = (RowKey) other; | ||
| return identifierFields().equals(that.identifierFields()); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return identifierFields().hashCode(); | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| StringBuilder sb = new StringBuilder(); | ||
| sb.append("["); | ||
| for (RowKeyIdentifierField field : identifierFields) { | ||
| sb.append("\n"); | ||
| sb.append(" ").append(field); | ||
| } | ||
| if (identifierFields.length > 0) { | ||
| sb.append("\n"); | ||
| } | ||
| sb.append("]"); | ||
| return sb.toString(); | ||
| } | ||
|
|
||
| /** | ||
| * Creates a new {@link Builder row key builder} for the given {@link Schema}. | ||
| * | ||
| * @param schema a schema | ||
| * @return a row key builder for the given schema. | ||
| */ | ||
| public static Builder builderFor(Schema schema) { | ||
| return new Builder(schema); | ||
| } | ||
|
|
||
| /** | ||
| * A builder to create valid {@link RowKey row key}. | ||
| * <p> | ||
| * Call {@link #builderFor(Schema)} to create a new builder. | ||
| */ | ||
| public static class Builder { | ||
| private final Schema schema; | ||
| private final Set<RowKeyIdentifierField> fields = Sets.newHashSet(); | ||
|
|
||
| private Builder(Schema schema) { | ||
| this.schema = schema; | ||
| } | ||
|
|
||
| public Builder addField(String name) { | ||
| Types.NestedField column = schema.findField(name); | ||
| ValidationException.check(column != null, "Cannot find column with name %s in schema %s", name, schema); | ||
| return addField(column); | ||
| } | ||
|
|
||
| public Builder addField(int id) { | ||
| Types.NestedField column = schema.findField(id); | ||
| ValidationException.check(column != null, "Cannot find column with ID %s in schema %s", id, schema); | ||
| return addField(column); | ||
| } | ||
|
|
||
| private Builder addField(Types.NestedField column) { | ||
| ValidationException.check(column.isRequired(), | ||
| "Cannot add column %s to row key because it is not a required column in schema %s", column, schema); | ||
| ValidationException.check(column.type().isPrimitiveType(), | ||
| "Cannot add column %s to row key because it is not a primitive data type in schema %s", column, schema); | ||
| fields.add(new RowKeyIdentifierField(column.fieldId())); | ||
| return this; | ||
| } | ||
|
|
||
| public RowKey build() { | ||
| if (fields.size() == 0) { | ||
| return NOT_IDENTIFIED; | ||
| } | ||
|
|
||
| return new RowKey(schema, fields); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg; | ||
|
|
||
| import java.io.Serializable; | ||
| import java.util.Objects; | ||
|
|
||
| /** | ||
| * An identifier field in {@link RowKey} | ||
| * <p> | ||
| * The field must be: | ||
| * 1. a required column in the table schema | ||
| * 2. a primitive type column | ||
| */ | ||
| public class RowKeyIdentifierField implements Serializable { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems to be a class that wraps a single ID. Could we get rid of it? Instead, |
||
|
|
||
| private final int sourceId; | ||
|
|
||
| RowKeyIdentifierField(int sourceId) { | ||
| this.sourceId = sourceId; | ||
| } | ||
|
|
||
| public int sourceId() { | ||
| return sourceId; | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return "(" + sourceId + ")"; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object other) { | ||
| if (this == other) { | ||
| return true; | ||
| } else if (other == null || getClass() != other.getClass()) { | ||
| return false; | ||
| } | ||
|
|
||
| RowKeyIdentifierField that = (RowKeyIdentifierField) other; | ||
| return sourceId == that.sourceId; | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(sourceId); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ | |
| import java.util.List; | ||
| import java.util.Map; | ||
| import org.apache.iceberg.PartitionSpec; | ||
| import org.apache.iceberg.RowKey; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to introduce a createTable method that could accept
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opening a separate issue for this is good enough for me now.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a good point, I thought about that, and there were 2 reasons that let me decide to not add it:
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, I agree it's good to use |
||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.SortOrder; | ||
| import org.apache.iceberg.Table; | ||
|
|
@@ -360,6 +361,14 @@ interface TableBuilder { | |
| */ | ||
| TableBuilder withSortOrder(SortOrder sortOrder); | ||
|
|
||
| /** | ||
| * Sets a row key for the table. | ||
| * | ||
| * @param rowKey a row key | ||
| * @return this for method chaining | ||
| */ | ||
| TableBuilder withRowKey(RowKey rowKey); | ||
|
|
||
| /** | ||
| * Sets a location for the table. | ||
| * | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't have a strong opinion but I'd go for `RowIdentifier` or `RowId`. I think `Key` means uniqueness, but I'll be fine this way too as long as we agree Iceberg does not ensure uniqueness.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `key` does not always mean `uniqueness` in my mind. From the MySQL documentation, an `index` could be created on top of key columns, and the `index` could choose to be unique or non-unique.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we have gone back and forth on this naming, and for now I would prefer the `Key` case because `Id` is heavily used in table metadata to mean concepts such as `spec-id`, `schema-id`, `order-id`, etc., which are the increasing IDs of different specs. Using a different keyword `Key` would provide more clarity in the table metadata.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with Jack's logic that "id" is typically used in Iceberg to refer to a numeric identifier. It would be odd to use `RowId`, especially given the overlap with the JDBC one. But we have had a significant number of people who find "key" confusing when it is a non-unique "key".
What about shifting the focus from the "key" or "identifier" to the fields? We could use `identifier-field-ids` to hold the collection and add `identifierFieldIds` to APIs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I renamed class `RowKeyField` to `RowKeyIdentifierField`, and `fields` to `identifier-fields` in metadata. Please let me know if that feels better.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Personally, I don't have a strong opinion about `RowKeyField` or `RowKeyIdentifierField`. I'm okay with whichever of them you think is good.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be more clear, I don't think we should ignore the near consensus from our sync discussion that "key" is misleading. I think we should instead call this class `IdentityFields` (or something similar) and store `identifier-field-ids` in table metadata.