Changes from all commits
103 commits
3569172
Docs: Fix variable name in Flink doc (#2668)
wangjunyou Jun 12, 2021
2c29032
Nessie: Use AssertJ assertions (#2684)
nastra Jun 14, 2021
8104769
Core: add key_metadata to ManifestFile schema and classes (#2675)
Jun 14, 2021
a6b0f42
Docs: Add cache-enabled to catalog property list (#2648)
Jun 15, 2021
5a61211
ORC: Remove unused constants in generic readers (#2695)
jerqi Jun 16, 2021
7be269d
AWS: Fix typo in S3OutputFile.createOrOverwrite exception message (#2…
kbendick Jun 16, 2021
e79580b
Update spec for v2 changes (#2654)
rdblue Jun 16, 2021
2366154
Parquet: Update to 1.12.0 (#2441)
Fokko Jun 16, 2021
3c41289
[python] Adding type ignores for dependencies without type support (#…
TGooch44 Jun 17, 2021
b2ebf22
Core: Use .as() with AssertJ (#2706)
nastra Jun 17, 2021
629da77
API: Add more null checks to TableIdentifier (#2703)
nastra Jun 17, 2021
1f64154
API: Fix Namespace null handling (#2704)
nastra Jun 17, 2021
9c859df
Docs: Add Adobe Migration article (#2707)
rominparekh Jun 17, 2021
9ebebdd
Core: Do not allow optional, double, or float identifier fields (#2705)
Jun 17, 2021
4c013a8
[Python] support custom target name in partition spec builder (#2689)
jun-he Jun 17, 2021
d9bf148
Docs: updating README to link to github actions instead of travis-ci.…
TGooch44 Jun 18, 2021
965775a
Core: Use equals instead of reference equality (#2714)
nastra Jun 18, 2021
ec69a25
Tests: Add unit tests for InternalRecordWrapper, RowDataWrapper, Inte…
openinx Jun 18, 2021
7798094
Spark: Fix scanAllFiles in MicroBatch.open (#2667)
southernriver Jun 18, 2021
e750316
Docs: Add commit.status-check.* properties (#2661)
coolderli Jun 18, 2021
a9f4363
Core: Add delete marker metadata column (#2538)
chenjunjiedada Jun 18, 2021
765ec12
[python] Adding Unknown and Void transforms (#2697)
TGooch44 Jun 19, 2021
619603c
fix: add and remove partition transform on same column failed when us…
vinson0526 Jun 21, 2021
ed0c702
[SPARK][BUILD] Allow existing spark2 JMH benchmarks to work with eith…
kbendick Jun 21, 2021
f8d173e
Core: Replace LinkedList usage with ArrayDeque (#2713)
nastra Jun 21, 2021
0c784fa
Core: Add JDBC catalog implementation (#1870)
ismailsimsek Jun 21, 2021
83ebd4e
Core: Throw an error for incremental scans of metadata tables (#2617)
kbendick Jun 22, 2021
63392d9
Core: Use CharSequenceSet instead of Set<CharSequence> (#2712)
nastra Jun 22, 2021
1460743
Spark: Add extensions DDL to set identifier fields (#2560)
Jun 22, 2021
ca575e9
[AWS] Fix MissingFail error prone warning by cleaning up GlueCatalogC…
kbendick Jun 22, 2021
8d5c498
Core: Add predicate push down for partitions metadata table (#2358)
szehon-ho Jun 22, 2021
01393a0
Core: Add HadoopConfigurable interface to serialize custom FileIO (#2…
Jun 23, 2021
8d16bad
Docs: Add catalog and metadata files to metadata structure diagram (#…
Jun 23, 2021
f81d8ad
AWS: add DynamoDb catalog (#2688)
Jun 23, 2021
92a264b
Use bulk decryption interface in ArrowReader (#2720)
nastra Jun 24, 2021
111fe81
[Spark] Support building against both Spark 3.0 and Spark 3.1. (#2512)
wypoon Jun 24, 2021
2e46847
Core: Fix float and double metrics for Parquet and ORC (#2464)
yyanyy Jun 24, 2021
98da974
API: Use equals instead of reference equality (#2716)
nastra Jun 24, 2021
c3ac4c6
Spark: Fix check for SQL extensions with extra white space (#2729)
jfz Jun 25, 2021
9cfcf5c
Spark: Support micro-batch streaming read for DSv2 (#2660)
SreeramGarlapati Jun 25, 2021
2dd5b4e
Docs: Remove file spark.md since it is outdated and causes confusion …
flyrain Jun 25, 2021
bf2cbc3
Flink: Rename FlinkTableOptions to more generic FlinkConfigOptions
stevenzwu Jun 28, 2021
aa65c06
Add support for TimeType / UUIDType (#2739)
nastra Jun 28, 2021
a100d2d
Spark: Fix file-open-cost in DSv2 streams (#2743)
SreeramGarlapati Jun 28, 2021
d4d376b
Core: Add schema-id to snapshots (#2275)
yyanyy Jun 29, 2021
8798f4e
Spec: Fix diagram alignment and size (#2750)
Jun 29, 2021
ee5a04f
Spec: Update for v2 schemas in remaining sections (#2748)
rdblue Jun 29, 2021
56ae374
Flink: Add an optional uidPrefix to FlinkSink#Builder to explicitly s…
stevenzwu Jun 29, 2021
af1e1f6
Docs: Update directions for joining #iceberg on Slack (#2758)
cwsteinbach Jun 29, 2021
735c70f
Core: Validate planTasks and splitFiles args in TableScanUtil (#2759)
RussellSpitzer Jun 29, 2021
b5ca06e
Core: Bin pack strategy cosmetic changes (#2770)
RussellSpitzer Jun 30, 2021
c579f0b
Docs: Describe how to configure Code formatter for IntelliJ IDEA (#2766)
nastra Jul 1, 2021
4b09c3d
Spark: Add limited support for vectorized reads for Parquet V2 (#2749)
samarthjain Jul 1, 2021
1ca781b
Docs: Update for mkdocs 1.2 (#2747)
rdblue Jul 1, 2021
9786fd1
Docs: Fix typo in flink.md (#2772)
a49a Jul 2, 2021
c92092f
Spark: RemoveReachableFiles action should fail if GC is disabled (#2763)
karuppayya Jul 2, 2021
cf0b94f
Docs: Describe available Benchmarks and how to run them (#2767)
nastra Jul 2, 2021
8671a93
Nessie: Properly format code in Nessie module (#2733)
nastra Jul 5, 2021
ea240a8
Style: Delete blank line of CachedClientPool.java (#2787)
southernriver Jul 6, 2021
072a86a
Spec: Update v2 change summary (#2762)
rdblue Jul 6, 2021
3e9684e
Build: bump up DiffPlug Spotless version (#2776)
Jul 6, 2021
6bcca16
Core: Fix JdbcCatalog CATALOG_TABLE_NAME to be lowercase (#2778)
haormj Jul 8, 2021
703aab4
Build: Change Spark Versions to Support M1 Processors (#2795)
RussellSpitzer Jul 9, 2021
5cf4248
Docs: Fixes broken links to old spark doc page (#2801)
RussellSpitzer Jul 9, 2021
bed47a4
Build: Upgrade to JUnit 5 (#2797)
nastra Jul 10, 2021
25eaeba
Spark: Reimplement RewriteDatafilesAction with partial progress (#2591)
RussellSpitzer Jul 11, 2021
6ab914d
Don't use deprecated methods
nastra Jun 25, 2021
42466b9
Refactor VectorizedArrowReader
nastra Jun 25, 2021
f2943d3
Reduce code duplication in VectorizedColumnIterator
nastra Jun 25, 2021
2842f0b
Reduce code duplication in VectorizedDictionaryEncodedParquetValuesRe…
nastra Jun 25, 2021
0667f38
Reduce code duplication in VectorizedPageIterator
nastra Jun 28, 2021
8058ec1
Reduce code duplication in VectorizedParquetDefinitionLevelReader
nastra Jun 28, 2021
87aea34
Upgrade to Tez 0.10.1 (#2790)
marton-bod Jul 12, 2021
0bb89d0
Spark: Parallelize task init when fetching locality info (#2800)
Jul 12, 2021
40e626a
Spark: Add table property to skip delete snapshots in streaming (#2752)
daksha121 Jul 13, 2021
712fe66
API: Use delete instead of remove in action names (#2810)
aokolnychyi Jul 13, 2021
118efb6
Spark: Use JavaSparkContext.fromSparkContext instead of constructor (…
aokolnychyi Jul 13, 2021
f6a9103
Spark: Add missing deprecation annotations for old actions (#2811)
aokolnychyi Jul 13, 2021
3947849
Docs: Fix link to intellij-java-palantir-style.xml (#2817)
nastra Jul 13, 2021
1a903f6
Spark : Add Files Perf improvement by push down partition filter to S…
szehon-ho Jul 13, 2021
b3fb81a
Core: Use Avro 1.10.1 (#1648)
Fokko Jul 13, 2021
3268799
spark: Add in support to read timestamp without timezone from parquet
bkahloon Feb 28, 2021
4b9a190
spark: Remove ORC vectorized test for reading timestamp without timezone
bkahloon Feb 28, 2021
f5036fb
Spark: address PR comments
bkahloon Mar 4, 2021
bac19c6
Spark: fix build failure due to import of all iceberg packages
bkahloon Mar 6, 2021
ab0bf3a
Spark: remove unsed imports and try to fix package import ordering
bkahloon Mar 7, 2021
f8a5293
Spark: fix code formatting issue
bkahloon Mar 8, 2021
ee386b9
Spark: fix code formatting issue
bkahloon Mar 8, 2021
fc6ee0e
Spark: Fix formatting error of long line
bkahloon Mar 9, 2021
e537c0b
Spark: Fix formatting error of long line
bkahloon Mar 9, 2021
abb607e
Add support for writing timestamps without timezone.
sshkvar Jun 11, 2021
4bee290
Added missed check for handling timestamp without zone for Writer in …
sshkvar Jun 17, 2021
0259262
Added missed check for handling timestamp without zone for Writer in …
sshkvar Jun 17, 2021
1579abc
Address PR comments.
sshkvar Jun 29, 2021
4f37486
Address PR comments.
sshkvar Jun 30, 2021
bafaffb
Address PR comments.
sshkvar Jul 1, 2021
7acaec0
Address PR comments.
sshkvar Jul 1, 2021
cee72a0
Address few little clean up
sshkvar Jul 12, 2021
459ce89
Address few little clean up
sshkvar Jul 12, 2021
d14b2b2
Address few little clean up
sshkvar Jul 12, 2021
398a2a0
fix for `'lambda arguments' has incorrect indentation level 12, expec…
sshkvar Jul 12, 2021
3f6b0f2
fix for `incorrect indentation level 6, expected level should be 8.`
sshkvar Jul 12, 2021
bc316c4
Added withSQLConf method to AvroDataTest.java as suggested in the PR …
sshkvar Jul 13, 2021
5 changes: 5 additions & 0 deletions .baseline/checkstyle/checkstyle.xml
@@ -46,6 +46,11 @@
<property name="format" value="sparkContext\(\)\.hadoopConfiguration\(\)"/>
<property name="message" value="Are you sure that you want to use sparkContext().hadoopConfiguration()? In most cases, you should use sessionState().newHadoopConf() instead, so that the Hadoop configurations specified in the Spark session configuration will come into effect."/>
</module>
<module name="RegexpSingleline">
<property name="fileExtensions" value="java"/>
<property name="format" value="new JavaSparkContext\(.*\)"/>
<property name="message" value="Prefer using JavaSparkContext.fromSparkContext() instead of calling a constructor directly."/>
</module>
<module name="SuppressionFilter"> <!-- baseline-gradle: README.md -->
<property name="file" value="${config_loc}/checkstyle-suppressions.xml"/>
</module>
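For context on the rule added above: it rejects direct construction of JavaSparkContext in favour of the factory method. A minimal sketch of both forms (illustrative only, not part of this diff):

```java
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class JavaSparkContextExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local").appName("example").getOrCreate();
    SparkContext sc = spark.sparkContext();

    // Flagged by the new "new JavaSparkContext\(.*\)" check: wraps the context via the constructor.
    JavaSparkContext viaConstructor = new JavaSparkContext(sc);

    // Preferred: obtain the wrapper through the factory method instead.
    JavaSparkContext viaFactory = JavaSparkContext.fromSparkContext(sc);

    spark.stop();
  }
}
```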
6 changes: 6 additions & 0 deletions .gitignore
@@ -25,6 +25,12 @@ lib/
# web site build
site/site

# benchmark output
spark2/benchmark/*
!spark2/benchmark/.gitkeep
spark3/benchmark/*
!spark3/benchmark/.gitkeep

__pycache__/
*.py[cod]
.eggs/
3 changes: 2 additions & 1 deletion README.md
@@ -19,7 +19,8 @@

![](site/docs/img/Iceberg-logo.png)

[![](https://travis-ci.org/apache/iceberg.svg?branch=master)](https://travis-ci.org/apache/iceberg)
[![](https://github.com/apache/iceberg/actions/workflows/java-ci.yml/badge.svg)](https://github.com/apache/iceberg/actions/workflows/java-ci.yml)
[![](https://github.com/apache/iceberg/actions/workflows/python-ci.yml/badge.svg)](https://github.com/apache/iceberg/actions/workflows/python-ci.yml)
[![Slack](https://img.shields.io/badge/chat-on%20Slack-brightgreen.svg)](https://the-asf.slack.com/archives/CF01LKV9S)

Apache Iceberg is a new table format for storing large, slow-moving tabular data. It is designed to improve on the de-facto standard table layout built into Hive, Trino, and Spark.
13 changes: 11 additions & 2 deletions api/src/main/java/org/apache/iceberg/ManifestFile.java
@@ -62,14 +62,16 @@ public interface ManifestFile {
Types.NestedField PARTITION_SUMMARIES = optional(507, "partitions",
Types.ListType.ofRequired(508, PARTITION_SUMMARY_TYPE),
"Summary for each partition");
// next ID to assign: 519
Types.NestedField KEY_METADATA = optional(519, "key_metadata", Types.BinaryType.get(),
"Encryption key metadata blob");
// next ID to assign: 520

Schema SCHEMA = new Schema(
PATH, LENGTH, SPEC_ID, MANIFEST_CONTENT,
SEQUENCE_NUMBER, MIN_SEQUENCE_NUMBER, SNAPSHOT_ID,
ADDED_FILES_COUNT, EXISTING_FILES_COUNT, DELETED_FILES_COUNT,
ADDED_ROWS_COUNT, EXISTING_ROWS_COUNT, DELETED_ROWS_COUNT,
PARTITION_SUMMARIES);
PARTITION_SUMMARIES, KEY_METADATA);

static Schema schema() {
return SCHEMA;
@@ -179,6 +181,13 @@ default boolean hasDeletedFiles() {
*/
List<PartitionFieldSummary> partitions();

/**
* Returns metadata about how this manifest file is encrypted, or null if the file is stored in plain text.
*/
default ByteBuffer keyMetadata() {
return null;
}

/**
* Copies this {@link ManifestFile manifest file}. Readers can reuse manifest file instances; use
* this method to make defensive copies.
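A hedged sketch of how calling code might consult the new keyMetadata() field; the snapshot iteration and the encryption hand-off are illustrative, not part of this diff:

```java
// Assumes an org.apache.iceberg.Table named `table` is in scope.
for (ManifestFile manifest : table.currentSnapshot().allManifests()) {
  ByteBuffer keyMetadata = manifest.keyMetadata();
  if (keyMetadata == null) {
    // Manifest is stored in plain text and can be read directly.
  } else {
    // Hand keyMetadata to the table's encryption layer to obtain a decryption key
    // before opening the manifest file.
  }
}
```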
4 changes: 2 additions & 2 deletions api/src/main/java/org/apache/iceberg/Schema.java
@@ -193,11 +193,11 @@ public List<NestedField> columns() {
* It consists of a unique set of primitive fields in the schema.
* An identifier field must be at root, or nested in a chain of structs (no maps or lists).
* A row should be unique in a table based on the values of the identifier fields.
* Optional, float and double columns cannot be used as identifier fields.
* However, Iceberg identifier differs from primary key in the following ways:
* <ul>
* <li>Iceberg does not enforce the uniqueness of a row based on this identifier information.
* It is used for operations like upsert to define the default upsert key.</li>
* <li>NULL can be used as value of an identifier field. Iceberg ensures null-safe equality check.</li>
* <li>A nested field in a struct can be used as an identifier. For example, if there is a "last_name" field
* inside a "user" struct in a schema, field "user.last_name" can be set as a part of the identifier field.</li>
* </ul>
@@ -215,7 +215,7 @@ public Set<Integer> identifierFieldIds() {
public Set<String> identifierFieldNames() {
return identifierFieldIds()
.stream()
.map(id -> findField(id).name())
.map(id -> lazyIdToName().get(id))
.collect(Collectors.toSet());
}
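A hedged sketch of declaring identifier fields when building a schema; the two-argument Schema constructor and the field ids used here are assumptions for illustration:

```java
// "id" is required and not a float/double, so it is allowed as an identifier field.
Schema schema = new Schema(
    Arrays.asList(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get())),
    Collections.singleton(1));

schema.identifierFieldNames();  // ["id"]
```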

9 changes: 9 additions & 0 deletions api/src/main/java/org/apache/iceberg/Snapshot.java
@@ -126,4 +126,13 @@ public interface Snapshot extends Serializable {
* @return the location of the manifest list for this Snapshot
*/
String manifestListLocation();

/**
* Return the id of the schema used when this snapshot was created, or null if this information is not available.
*
* @return schema id associated with this snapshot
*/
default Integer schemaId() {
return null;
}
}
2 changes: 1 addition & 1 deletion api/src/main/java/org/apache/iceberg/SortField.java
@@ -80,7 +80,7 @@ public NullOrder nullOrder() {
* @return true if this order satisfies the given order
*/
public boolean satisfies(SortField other) {
if (this == other) {
if (Objects.equals(this, other)) {
return true;
} else if (sourceId != other.sourceId || direction != other.direction || nullOrder != other.nullOrder) {
return false;
7 changes: 7 additions & 0 deletions api/src/main/java/org/apache/iceberg/Table.java
@@ -60,6 +60,13 @@ default String name() {
*/
Schema schema();

/**
* Return a map of {@link Schema schema} for this table.
*
* @return this table's schema map
*/
Map<Integer, Schema> schemas();

/**
* Return the {@link PartitionSpec partition spec} for this table.
*
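Taken together with the new Snapshot#schemaId() above, the schemas() map lets callers resolve the schema a snapshot was written with. A hedged usage sketch; the fallback logic is illustrative:

```java
// Assumes an org.apache.iceberg.Table named `table` is in scope.
Snapshot snapshot = table.currentSnapshot();
Integer schemaId = snapshot.schemaId();  // may be null for snapshots written before this change

Schema snapshotSchema = (schemaId != null && table.schemas().containsKey(schemaId))
    ? table.schemas().get(schemaId)
    : table.schema();  // fall back to the current table schema
```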
@@ -41,10 +41,10 @@ default MigrateTable migrateTable(String tableIdent) {
}

/**
* Instantiates an action to remove orphan files.
* Instantiates an action to delete orphan files.
*/
default RemoveOrphanFiles removeOrphanFiles(Table table) {
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement removeOrphanFiles");
default DeleteOrphanFiles deleteOrphanFiles(Table table) {
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement deleteOrphanFiles");
}

/**
@@ -69,9 +69,9 @@ default ExpireSnapshots expireSnapshots(Table table) {
}

/**
* Instantiates an action to remove all the files reachable from given metadata location.
* Instantiates an action to delete all the files reachable from given metadata location.
*/
default RemoveReachableFiles removeReachableFiles(String metadataLocation) {
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement removeReachableFiles");
default DeleteReachableFiles deleteReachableFiles(String metadataLocation) {
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement deleteReachableFiles");
}
}
@@ -22,13 +22,13 @@
import java.util.function.Consumer;

/**
* An action that removes orphan files in a table.
* An action that deletes orphan files in a table.
* <p>
* A metadata or data file is considered orphan if it is not reachable by any valid snapshot.
* The set of actual files is built by listing the underlying storage which makes this operation
* expensive.
*/
public interface RemoveOrphanFiles extends Action<RemoveOrphanFiles, RemoveOrphanFiles.Result> {
public interface DeleteOrphanFiles extends Action<DeleteOrphanFiles, DeleteOrphanFiles.Result> {
/**
* Passes a location which should be scanned for orphan files.
* <p>
@@ -38,7 +38,7 @@ public interface RemoveOrphanFiles extends Action<RemoveOrphanFiles, RemoveOrpha
* @param location the location where to look for orphan files
* @return this for method chaining
*/
RemoveOrphanFiles location(String location);
DeleteOrphanFiles location(String location);

/**
* Removes orphan files only if they are older than the given timestamp.
@@ -52,7 +52,7 @@ public interface RemoveOrphanFiles extends Action<RemoveOrphanFiles, RemoveOrpha
* @param olderThanTimestamp a long timestamp, as returned by {@link System#currentTimeMillis()}
* @return this for method chaining
*/
RemoveOrphanFiles olderThan(long olderThanTimestamp);
DeleteOrphanFiles olderThan(long olderThanTimestamp);

/**
* Passes an alternative delete implementation that will be used for orphan files.
@@ -65,7 +65,7 @@ public interface RemoveOrphanFiles extends Action<RemoveOrphanFiles, RemoveOrpha
* @param deleteFunc a function that will be called to delete files
* @return this for method chaining
*/
RemoveOrphanFiles deleteWith(Consumer<String> deleteFunc);
DeleteOrphanFiles deleteWith(Consumer<String> deleteFunc);

/**
* The action result that contains a summary of the execution.
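A hedged usage sketch of the renamed action; `actions` stands in for any ActionsProvider implementation (for example a Spark-based one) and is not part of this diff:

```java
// Delete orphan files older than three days.
DeleteOrphanFiles.Result result = actions
    .deleteOrphanFiles(table)
    .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3))
    .execute();
```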
@@ -24,24 +24,24 @@
import org.apache.iceberg.io.FileIO;

/**
* An action that removes all files referenced by a table metadata file.
* An action that deletes all files referenced by a table metadata file.
* <p>
* This action will irreversibly delete all reachable files such as data files, manifests,
* manifest lists and should be used to clean up the underlying storage once a table is dropped
* and no longer needed.
* <p>
* Implementations may use a query engine to distribute parts of work.
*/
public interface RemoveReachableFiles extends Action<RemoveReachableFiles, RemoveReachableFiles.Result> {
public interface DeleteReachableFiles extends Action<DeleteReachableFiles, DeleteReachableFiles.Result> {

/**
* Passes an alternative delete implementation that will be used for files.
*
* @param removeFunc a function that will be called to delete files.
* @param deleteFunc a function that will be called to delete files.
* The function accepts path to file as an argument.
* @return this for method chaining
*/
RemoveReachableFiles deleteWith(Consumer<String> removeFunc);
DeleteReachableFiles deleteWith(Consumer<String> deleteFunc);

/**
* Passes an alternative executor service that will be used for files removal.
@@ -51,39 +51,39 @@ public interface RemoveReachableFiles extends Action<RemoveReachableFiles, Remov
* @param executorService the service to use
* @return this for method chaining
*/
RemoveReachableFiles executeDeleteWith(ExecutorService executorService);
DeleteReachableFiles executeDeleteWith(ExecutorService executorService);

/**
* Set the {@link FileIO} to be used for files removal
*
* @param io FileIO to use for files removal
* @return this for method chaining
*/
RemoveReachableFiles io(FileIO io);
DeleteReachableFiles io(FileIO io);

/**
* The action result that contains a summary of the execution.
*/
interface Result {

/**
* Returns the number of data files removed.
* Returns the number of deleted data files.
*/
long removedDataFilesCount();
long deletedDataFilesCount();

/**
* Returns the number of manifests removed.
* Returns the number of deleted manifests.
*/
long removedManifestsCount();
long deletedManifestsCount();

/**
* Returns the number of manifest lists removed.
* Returns the number of deleted manifest lists.
*/
long removedManifestListsCount();
long deletedManifestListsCount();

/**
* Returns the number of metadata json, version hint files removed.
* Returns the number of deleted metadata json, version hint files.
*/
long otherRemovedFilesCount();
long deletedOtherFilesCount();
}
}
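A hedged sketch of the renamed action in use; the metadata location, `actions`, and `fileIO` handles are placeholders:

```java
// Irreversibly delete everything reachable from a dropped table's last metadata file.
DeleteReachableFiles.Result result = actions
    .deleteReachableFiles("s3://bucket/warehouse/db/tbl/metadata/v12.metadata.json")
    .io(fileIO)
    .execute();

long totalDeleted = result.deletedDataFilesCount() + result.deletedManifestsCount()
    + result.deletedManifestListsCount() + result.deletedOtherFilesCount();
```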
30 changes: 17 additions & 13 deletions api/src/main/java/org/apache/iceberg/actions/RewriteDataFiles.java
@@ -19,7 +19,7 @@

package org.apache.iceberg.actions;

import java.util.Map;
import java.util.List;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.expressions.Expression;

@@ -49,10 +49,10 @@ public interface RewriteDataFiles extends SnapshotUpdate<RewriteDataFiles, Rewri
/**
* The entire rewrite operation is broken down into pieces based on partitioning and within partitions based
* on size into groups. These sub-units of the rewrite are referred to as file groups. The largest amount of data that
* should be compacted in a single group is controlled by MAX_FILE_GROUP_SIZE_BYTES. This helps with breaking down the
* rewriting of very large partitions which may not be rewritable otherwise due to the resource constraints of the
* cluster. For example a sort based rewrite may not scale to terabyte sized partitions, those partitions need to be
* worked on in small subsections to avoid exhaustion of resources.
* should be compacted in a single group is controlled by {@link #MAX_FILE_GROUP_SIZE_BYTES}. This helps with
* breaking down the rewriting of very large partitions which may not be rewritable otherwise due to the resource
* constraints of the cluster. For example a sort based rewrite may not scale to terabyte sized partitions, those
* partitions need to be worked on in small subsections to avoid exhaustion of resources.
* <p>
* When grouping files, the underlying rewrite strategy will use this value as to limit the files which
* will be included in a single file group. A group will be processed by a single framework "action". For example,
@@ -68,20 +68,14 @@
* independently and asynchronously.
**/
String MAX_CONCURRENT_FILE_GROUP_REWRITES = "max-concurrent-file-group-rewrites";
int MAX_CONCURRENT_FILE_GROUP_ACTIONS_DEFAULT = 1;
int MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT = 1;

/**
* The output file size that this rewrite strategy will attempt to generate when rewriting files. By default this
* will use the "write.target-file-size-bytes value" in the table properties of the table being updated.
*/
String TARGET_FILE_SIZE_BYTES = "target-file-size-bytes";

/**
* The partition spec to use when writing the output data from this operation. By default uses the
* current table partition spec.
*/
String OUTPUT_PARTITION_SPEC_ID = "output-partition-spec-id";

/**
* Choose BINPACK as a strategy for this rewrite operation
* @return this for method chaining
@@ -106,14 +100,24 @@ default RewriteDataFiles binPack() {
* will report a total failure for the job.
*/
interface Result {
Map<FileGroupInfo, FileGroupRewriteResult> resultMap();
List<FileGroupRewriteResult> rewriteResults();

default int addedDataFilesCount() {
return rewriteResults().stream().mapToInt(FileGroupRewriteResult::addedDataFilesCount).sum();
}

default int rewrittenDataFilesCount() {
return rewriteResults().stream().mapToInt(FileGroupRewriteResult::rewrittenDataFilesCount).sum();
}
}

/**
* For a particular file group, the number of files which are newly created and the number of files
* which were formerly part of the table but have been rewritten.
*/
interface FileGroupRewriteResult {
FileGroupInfo info();

int addedDataFilesCount();

int rewrittenDataFilesCount();
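A hedged sketch showing how the rewrite options and the reworked Result could be used together; the `actions` handle and the filter column are placeholders:

```java
// Bin-pack rewrite limited to one partition, using the options defined above.
RewriteDataFiles.Result result = actions
    .rewriteDataFiles(table)
    .binPack()
    .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "4")
    .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, String.valueOf(512L * 1024 * 1024))
    .filter(Expressions.equal("day", "2021-07-01"))
    .execute();

// The new default methods aggregate the per-group results.
int rewritten = result.rewrittenDataFilesCount();
int added = result.addedDataFilesCount();
```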
2 changes: 2 additions & 0 deletions api/src/main/java/org/apache/iceberg/catalog/Namespace.java
@@ -21,6 +21,7 @@

import java.util.Arrays;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;

/**
* A namespace in a {@link Catalog}.
@@ -34,6 +35,7 @@ public static Namespace empty() {
}

public static Namespace of(String... levels) {
Preconditions.checkArgument(null != levels, "Cannot create Namespace from null array");
if (levels.length == 0) {
return empty();
}
@@ -36,6 +36,7 @@ public class TableIdentifier {
private final String name;

public static TableIdentifier of(String... names) {
Preconditions.checkArgument(names != null, "Cannot create table identifier from null array");
Preconditions.checkArgument(names.length > 0, "Cannot create table identifier without a table name");
return new TableIdentifier(Namespace.of(Arrays.copyOf(names, names.length - 1)), names[names.length - 1]);
}
@@ -45,12 +46,14 @@ public static TableIdentifier of(Namespace namespace, String name) {
}

public static TableIdentifier parse(String identifier) {
Preconditions.checkArgument(identifier != null, "Cannot parse table identifier: null");
Iterable<String> parts = DOT.split(identifier);
return TableIdentifier.of(Iterables.toArray(parts, String.class));
}

private TableIdentifier(Namespace namespace, String name) {
Preconditions.checkArgument(name != null && !name.isEmpty(), "Invalid table name %s", name);
Preconditions.checkArgument(name != null && !name.isEmpty(), "Invalid table name: null or empty");
Preconditions.checkArgument(namespace != null, "Invalid Namespace: null");
this.namespace = namespace;
this.name = name;
}
Expand Down
Loading