@@ -1,20 +1,15 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
Contributor

Did this not match the rest of the files in the project? I'd prefer not to reformat headers in a different PR if we can avoid it.

Contributor

My understanding is that this is just a proof-of-concept PR, to compare with the other idea demonstrated here: #1421

I think once one is decided upon (if either of the versions is decided upon), then @RussellSpitzer will be updating it to be properly formatted etc. Please feel free to correct me if I'm wrong though, Russell 🙂

Member Author

Yep, sorry, I was just trying to get to a prototype real fast here.

*/

package org.apache.iceberg;
@@ -25,7 +20,7 @@
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

-interface ManifestEntry<F extends ContentFile<F>> {
+public interface ManifestEntry<F extends ContentFile<F>> {
Contributor

We've really tried to avoid making ManifestEntry public because exposing it makes the API much harder to use.

Contributor

Can you expand on why making ManifestEntry public makes the API harder to use? Is it just because we then have to account for the fact that this is a public API?

I personally am more fond of this proof of concept as opposed to the other one, so I'd love to get insight into your comment here.

If it's just because we would then have to treat this API as public, is it possible something could be done to mark it as private to Iceberg, equivalent to Scala's private[iceberg]?
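
For reference, the closest built-in Java has is package-private visibility (no modifier), which is what the declaration used before this change; unlike Scala's private[iceberg], it covers only the single org.apache.iceberg package, not subpackages or the rest of the project. A rough sketch:

  // Package-private: visible only inside org.apache.iceberg itself.
  // There is no Java modifier that scopes to "all of Iceberg".
  interface ManifestEntry<F extends ContentFile<F>> {
  }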

enum Status {
EXISTING(0),
ADDED(1),
33 changes: 33 additions & 0 deletions api/src/main/java/org/apache/iceberg/ManifestProcessor.java
@@ -0,0 +1,33 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.iceberg;

import java.io.Serializable;
import java.util.function.BiFunction;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;

public abstract class ManifestProcessor implements Serializable {
  public abstract <T extends ContentFile<T>> Iterable<CloseableIterable<ManifestEntry<T>>> readManifests(
      final Iterable<ManifestFile> fromIterable,
      BiFunction<ManifestFile, FileIO, CloseableIterable<ManifestEntry<T>>> reader);

  /**
   * A helper interface for converting lambdas into the correct type for the ManifestProcessor.
   *
   * @param <T> the ManifestEntry type being read from manifest files
   */
  public interface Func<T extends ContentFile<T>> extends
      BiFunction<ManifestFile, FileIO, CloseableIterable<ManifestEntry<T>>>, Serializable {}
Contributor

I know this is a WIP, so I'm partially commenting to follow along.

But I'm not a huge fan of this interface being named Func. Its usage also feels somewhat clunky to me, to be honest, but I'm more of a Scala developer than a Java developer by day, so that could just be my own bias showing.

Overall this is shaping up to be a great addition though.

Member Author

Check out the other PR too :) And if you have another approach I'd be glad to look into that as well. This is, in my opinion, definitely the more kludgy approach in Java. The only reason we need this interface is that otherwise you have to write (all of that type info) in front of the lambda; the interface is basically a type alias Java will use to convert the lambda into a real function.
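
Roughly, without the alias a caller would have to spell out the whole intersection type to get a serializable lambda, something like this (sketch; the reader body is just for illustration):

  BiFunction<ManifestFile, FileIO, CloseableIterable<ManifestEntry<DataFile>>> reader =
      (BiFunction<ManifestFile, FileIO, CloseableIterable<ManifestEntry<DataFile>>> & Serializable)
          (manifest, io) -> ManifestFiles.read(manifest, io).entries();

With the alias, all of that collapses to one name:

  ManifestProcessor.Func<DataFile> reader =
      (manifest, io) -> ManifestFiles.read(manifest, io).entries();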

Contributor

Ah. Ok. Thank you so much! As I've mentioned, in my day job I'm more of a Scala developer, so I'm much more used to working with all of the well-enriched type machinery that's available there. Thankfully I've finally stood all of this up on my own K8s cluster, so I can continue to get more experience with practical usage of Iceberg (especially between query systems).

When you explain it that way though, it makes perfect sense. :)

As for the approach being kludgy: if it gets the job done, speeds up manifest reading, and makes things like exception stack traces more readable than a very large number of lambda functions, I'm not necessarily opposed at all. As a person who spends a lot of time in their current position as a sort of developer advocate, helping people with the more esoteric parts of Spark and Flink etc., I can absolutely get behind simpler type info and less concern with the serializability of functions, especially if the lambdas get raised to real functions so that they have more readable stack traces. :)

I'll be sure to check out the other approach. And if I can come up with any approaches or modifications to this one that might be cleaner, I'll definitely let you know. I think the javadoc comment on the interface is likely sufficient to explain Func. It's possible that the relatively generic name Func is what tripped me up, but I can't think of a better name that doesn't conflict with other current Iceberg concepts (e.g. Transformers, etc.).
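
Just to check my understanding of the extension point: a custom processor would look something like this, right? (Untested sketch; the class name and threading policy are hypothetical, and I'm not proposing it for this PR.)

  package org.apache.iceberg;

  import java.util.List;
  import java.util.concurrent.ExecutionException;
  import java.util.concurrent.ExecutorService;
  import java.util.concurrent.Future;
  import java.util.function.BiFunction;
  import org.apache.iceberg.io.CloseableIterable;
  import org.apache.iceberg.io.FileIO;
  import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
  import org.apache.iceberg.relocated.com.google.common.collect.Lists;

  // A hypothetical processor that opens every manifest eagerly on a thread pool
  // instead of lazily on the calling thread, as LocalManifestProcessor does.
  public class ParallelManifestProcessor extends ManifestProcessor {
    private final FileIO io;
    // transient: an ExecutorService cannot be serialized and shipped anywhere.
    private final transient ExecutorService pool;

    public ParallelManifestProcessor(FileIO io, ExecutorService pool) {
      this.io = io;
      this.pool = pool;
    }

    @Override
    public <T extends ContentFile<T>> Iterable<CloseableIterable<ManifestEntry<T>>> readManifests(
        Iterable<ManifestFile> fromIterable,
        BiFunction<ManifestFile, FileIO, CloseableIterable<ManifestEntry<T>>> reader) {
      // Submit one read per manifest, then expose the futures as a plain Iterable.
      List<Future<CloseableIterable<ManifestEntry<T>>>> futures = Lists.newArrayList();
      for (ManifestFile manifest : fromIterable) {
        futures.add(pool.submit(() -> reader.apply(manifest, io)));
      }
      return Iterables.transform(futures, future -> {
        try {
          return future.get();
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        } catch (ExecutionException e) {
          throw new RuntimeException(e.getCause());
        }
      });
    }
  }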


}
5 changes: 5 additions & 0 deletions api/src/main/java/org/apache/iceberg/TableScan.java
@@ -94,6 +94,11 @@ public interface TableScan {
*/
TableScan includeColumnStats();

+  /**
+   * Doc doc doc
+   */
+  TableScan withManifestProcessor(ManifestProcessor processor);
Contributor

I see what you did here to get past the linter ;-)

If we decide to use this approach, let's be sure not to merge in Doc doc doc but instead a more descriptive comment, or even a TODO comment. However, since this is a WIP it's not worth your time to explain a pretty self-documenting method name just to appease the linter. Leaving this comment so we don't forget to update it if we do merge this in. :)

Member Author

Of course :)
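
For what it's worth, the intended usage is just something like this (rough sketch; the filter is only for illustration):

  TableScan scan = table.newScan()
      .withManifestProcessor(new LocalManifestProcessor(table.io()))
      .filter(Expressions.equal("date", "2020-09-01"));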


/**
* Create a new {@link TableScan} from this that will read the given data columns. This produces
* an expected schema that includes all fields that are either selected or used by this scan's
11 changes: 11 additions & 0 deletions core/src/main/java/org/apache/iceberg/BaseTableScan.java
@@ -54,6 +54,7 @@ abstract class BaseTableScan implements TableScan {
private final Table table;
private final Schema schema;
private final TableScanContext context;
+  protected ManifestProcessor manifestProcessor;

protected BaseTableScan(TableOperations ops, Table table, Schema schema) {
this(ops, table, schema, new TableScanContext());
@@ -64,6 +65,7 @@ protected BaseTableScan(TableOperations ops, Table table, Schema schema, TableSc
this.table = table;
this.schema = schema;
this.context = context;
+    this.manifestProcessor = new LocalManifestProcessor(table.io());
}

protected TableOperations tableOps() {
@@ -131,6 +133,9 @@ public TableScan useSnapshot(long scanSnapshotId) {
ops, table, schema, context.useSnapshotId(scanSnapshotId));
}




Contributor

Nit: unnecessary whitespace change

@Override
public TableScan asOfTime(long timestampMillis) {
Preconditions.checkArgument(context.snapshotId() == null,
@@ -275,6 +280,12 @@ public String toString() {
.toString();
}

+  @Override
+  public TableScan withManifestProcessor(ManifestProcessor processor) {
+    this.manifestProcessor = processor;
+    return this;
+  }

/**
* To be able to make refinements {@link #select(Collection)} and {@link #caseSensitive(boolean)} in any order,
* we resolve the schema to be projected lazily here.
2 changes: 1 addition & 1 deletion core/src/main/java/org/apache/iceberg/DataTableScan.java
@@ -86,7 +86,7 @@ public CloseableIterable<FileScanTask> planFiles(TableOperations ops, Snapshot s
manifestGroup = manifestGroup.planWith(ThreadPools.getWorkerPool());
}

-    return manifestGroup.planFiles();
+    return manifestGroup.withProcessor(this.manifestProcessor).planFiles();
}

@Override
@@ -19,14 +19,15 @@

package org.apache.iceberg;

+import java.io.Serializable;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.specific.SpecificData;
import org.apache.iceberg.avro.AvroSchemaUtil;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.types.Types;

class GenericManifestEntry<F extends ContentFile<F>>
-    implements ManifestEntry<F>, IndexedRecord, SpecificData.SchemaConstructable, StructLike {
+    implements ManifestEntry<F>, IndexedRecord, SpecificData.SchemaConstructable, StructLike, Serializable {
private final org.apache.avro.Schema schema;
private Status status = Status.EXISTING;
private Long snapshotId = null;
35 changes: 35 additions & 0 deletions core/src/main/java/org/apache/iceberg/LocalManifestProcessor.java
@@ -0,0 +1,35 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.iceberg;

import java.util.function.BiFunction;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;

public class LocalManifestProcessor extends ManifestProcessor {
  private final FileIO io;

  public LocalManifestProcessor(FileIO io) {
    this.io = io;
  }

  @Override
  public <T extends ContentFile<T>> Iterable<CloseableIterable<ManifestEntry<T>>> readManifests(
      Iterable<ManifestFile> fromIterable,
      BiFunction<ManifestFile, FileIO, CloseableIterable<ManifestEntry<T>>> reader) {
    return CloseableIterable.transform(
        CloseableIterable.withNoopClose(fromIterable), manifestFile -> reader.apply(manifestFile, io));
  }
}
103 changes: 72 additions & 31 deletions core/src/main/java/org/apache/iceberg/ManifestGroup.java
@@ -25,7 +25,6 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
-import java.util.function.BiFunction;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.iceberg.expressions.Evaluator;
@@ -36,6 +35,7 @@
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.relocated.com.google.common.base.Function;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
@@ -61,6 +61,7 @@ class ManifestGroup {
private List<String> columns;
private boolean caseSensitive;
private ExecutorService executorService;
+  private ManifestProcessor manifestProcessor;

ManifestGroup(FileIO io, Iterable<ManifestFile> manifests) {
this(io,
@@ -82,6 +83,12 @@ class ManifestGroup {
this.caseSensitive = true;
this.manifestPredicate = m -> true;
this.manifestEntryPredicate = e -> true;
+    this.manifestProcessor = new LocalManifestProcessor(io);
  }

+  ManifestGroup withProcessor(ManifestProcessor processor) {
+    this.manifestProcessor = processor;
+    return this;
+  }

ManifestGroup specsById(Map<Integer, PartitionSpec> newSpecsById) {
@@ -169,20 +176,26 @@ public CloseableIterable<FileScanTask> planFiles() {
select(Streams.concat(columns.stream(), ManifestReader.STATS_COLUMNS.stream()).collect(Collectors.toList()));
}

-    Iterable<CloseableIterable<FileScanTask>> tasks = entries((manifest, entries) -> {
-      int specId = manifest.partitionSpecId();
-      PartitionSpec spec = specsById.get(specId);
-      String schemaString = SchemaParser.toJson(spec.schema());
-      String specString = PartitionSpecParser.toJson(spec);
-      ResidualEvaluator residuals = residualCache.get(specId);
-      if (dropStats) {
-        return CloseableIterable.transform(entries, e -> new BaseFileScanTask(
-            e.file().copyWithoutStats(), deleteFiles.forEntry(e), schemaString, specString, residuals));
-      } else {
-        return CloseableIterable.transform(entries, e -> new BaseFileScanTask(
-            e.file().copy(), deleteFiles.forEntry(e), schemaString, specString, residuals));
-      }
-    });
+    LoadingCache<Integer, SpecCacheEntry> specCache = Caffeine.newBuilder().build(
+        specId -> {
+          PartitionSpec spec = specsById.get(specId);
+          return new SpecCacheEntry(SchemaParser.toJson(spec.schema()), PartitionSpecParser.toJson(spec),
+              residualCache.get(specId));
+        });
+
+    // TODO: Make this cleaner (maybe go back to the old method of two traversals? Make an API for BaseFileScanTask?)
+    // This will have different performance characteristics than the old version, since we do a cache lookup for
+    // every entry; but it will probably end up being essentially a no-op with branch prediction, since we look up
+    // the same thing over and over in order.
+    Iterable<CloseableIterable<FileScanTask>> tasks = entries(entries ->
+        CloseableIterable.transform(entries, e -> {
+          SpecCacheEntry cached = specCache.get(e.file().specId());
+          DataFile file = dropStats ? e.file().copyWithoutStats() : e.file().copy();
+          return new BaseFileScanTask(file, deleteFiles.forEntry(e), cached.schemaString, cached.specString,
+              cached.residuals);
+        }));

if (executorService != null) {
return new ParallelIterable<>(tasks, executorService);
@@ -200,11 +213,32 @@
* @return a CloseableIterable of manifest entries.
*/
public CloseableIterable<ManifestEntry<DataFile>> entries() {
-    return CloseableIterable.concat(entries((manifest, entries) -> entries));
+    return CloseableIterable.concat(entries(entry -> entry));
}

+  /*
+   * Generating the lambda in a static method keeps it from capturing a reference to the enclosing
+   * ManifestGroup, so we can ignore the serializability of this class.
+   */
+  private static ManifestProcessor.Func<DataFile> generateManifestProcessorFunc(
+      Map<Integer, PartitionSpec> specsById, Expression dataFilter, Expression partitionFilter,
+      boolean caseSensitive, List<String> columns, boolean ignoreDeleted) {
+    return (manifest, processorIO) -> {
+      ManifestReader<DataFile> reader = ManifestFiles.read(manifest, processorIO, specsById)
+          .filterRows(dataFilter)
+          .filterPartitions(partitionFilter)
+          .caseSensitive(caseSensitive)
+          .select(columns);
+
+      CloseableIterable<ManifestEntry<DataFile>> entries = reader.entries();
+      if (ignoreDeleted) {
+        entries = reader.liveEntries();
+      }
+      return entries;
+    };
+  }

private <T> Iterable<CloseableIterable<T>> entries(
-      BiFunction<ManifestFile, CloseableIterable<ManifestEntry<DataFile>>, CloseableIterable<T>> entryFn) {
+      Function<CloseableIterable<ManifestEntry<DataFile>>, CloseableIterable<T>> entryFn) {

LoadingCache<Integer, ManifestEvaluator> evalCache = specsById == null ?
null : Caffeine.newBuilder().build(specId -> {
PartitionSpec spec = specsById.get(specId);
@@ -236,20 +270,13 @@ private <T> Iterable<CloseableIterable<T>> entries(

matchingManifests = Iterables.filter(matchingManifests, manifestPredicate::test);

-    return Iterables.transform(
-        matchingManifests,
-        manifest -> {
-          ManifestReader<DataFile> reader = ManifestFiles.read(manifest, io, specsById)
-              .filterRows(dataFilter)
-              .filterPartitions(partitionFilter)
-              .caseSensitive(caseSensitive)
-              .select(columns);
-
-          CloseableIterable<ManifestEntry<DataFile>> entries = reader.entries();
-          if (ignoreDeleted) {
-            entries = reader.liveEntries();
-          }
+    Iterable<CloseableIterable<ManifestEntry<DataFile>>> fileReader =
+        manifestProcessor.readManifests(matchingManifests, generateManifestProcessorFunc(specsById, dataFilter,
+            partitionFilter, caseSensitive, columns, ignoreDeleted));
+
+    return Iterables.transform(
+        fileReader,
+        entries -> {
          if (ignoreExisting) {
            entries = CloseableIterable.filter(entries,
                entry -> entry.status() != ManifestEntry.Status.EXISTING);
@@ -261,7 +288,21 @@ private <T> Iterable<CloseableIterable<T>> entries(
          }

          entries = CloseableIterable.filter(entries, manifestEntryPredicate);
-          return entryFn.apply(manifest, entries);
+          return entryFn.apply(entries);
        });
}


+  // Static so instances don't capture a reference to ManifestGroup.
+  private static class SpecCacheEntry {
+    private final String schemaString;
+    private final String specString;
+    private final ResidualEvaluator residuals;
+
+    SpecCacheEntry(String schemaString, String specString, ResidualEvaluator residuals) {
+      this.schemaString = schemaString;
+      this.specString = specString;
+      this.residuals = residuals;
+    }
+  }

}
@@ -0,0 +1,30 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.iceberg.util;

import java.io.Serializable;
import java.util.Objects;
import java.util.function.Predicate;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;

public interface SerializablePredicate<T> extends Predicate<T>, Serializable {

  @Override
  default SerializablePredicate<T> and(Predicate<? super T> other) {
    Objects.requireNonNull(other);
    Preconditions.checkArgument(other instanceof SerializablePredicate);
    return (T x) -> this.test(x) && other.test(x);
  }
}
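
As a quick illustration of what the overridden and preserves (hypothetical usage, not part of this diff): composing two serializable predicates yields a predicate that is itself still Serializable, so it can be shipped to remote workers along with the manifest-reading function.

  SerializablePredicate<ManifestEntry<DataFile>> notDeleted =
      entry -> entry.status() != ManifestEntry.Status.DELETED;
  SerializablePredicate<ManifestEntry<DataFile>> added =
      entry -> entry.status() == ManifestEntry.Status.ADDED;
  SerializablePredicate<ManifestEntry<DataFile>> both = notDeleted.and(added);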