yifeih · yifeih · Mar 13, 2019 · Feb 28, 2019 · Feb 28, 2019 · Mar 2, 2019
diff --git a/build.gradle b/build.gradle
@@ -18,7 +18,11 @@
  */
 
 buildscript {
-  repositories { jcenter() }
+  repositories {
+    jcenter()
+    maven { url "https://palantir.bintray.com/releases" } // HTTPS: do not fetch build plugins over plaintext
+  }
+
   dependencies {
     classpath 'com.github.jengelman.gradle.plugins:shadow:2.0.0'
     classpath 'com.netflix.nebula:gradle-aggregate-javadocs-plugin:2.2.+'
@@ -51,6 +55,7 @@ subprojects {
   apply plugin: 'nebula.maven-base-publish'
 
   repositories {
+    maven { url "https://palantir.bintray.com/releases" } // HTTPS: do not resolve dependencies over plaintext
     mavenCentral()
     mavenLocal()
   }
@@ -66,16 +71,16 @@ subprojects {
   }
 
   ext {
-    hadoopVersion = '2.7.3'
+    hadoopVersion = '3.2.0-palantir.8' // NOTE(review): -palantir builds presumably resolve from the palantir bintray repository declared in repositories -- confirm
     avroVersion = '1.8.2'
     orcVersion = '1.4.2'
-    parquetVersion = '1.10.0'
+    parquetVersion = '1.12.0-palantir.4'
     hiveVersion = '3.1.0'
 
-    jacksonVersion = '2.6.7'
+    jacksonVersion = '2.7.3' // NOTE(review): confirm this matches the Jackson version the Spark build below expects
 
     scalaVersion = '2.11'
-    sparkVersion = '2.4.0'
+    sparkVersion = '3.0.0-palantir.18'
   }
 
   sourceCompatibility = '1.8'

diff --git a/core/src/main/java/com/netflix/iceberg/RemoveSnapshots.java b/core/src/main/java/com/netflix/iceberg/RemoveSnapshots.java
@@ -26,7 +26,6 @@
 import com.netflix.iceberg.exceptions.RuntimeIOException;
 import com.netflix.iceberg.util.Tasks;
 import com.netflix.iceberg.util.ThreadPools;
-import io.netty.util.internal.ConcurrentSet;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.IOException;
@@ -162,7 +161,7 @@ public void commit() {
       }
     }
 
-    Set<String> filesToDelete = new ConcurrentSet<>();
+    Set<String> filesToDelete = Sets.newConcurrentHashSet();
     Tasks.foreach(allManifests)
         .noRetry().suppressFailureWhenFinished()
         .executeWith(ThreadPools.getWorkerPool())

diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
@@ -21,4 +21,4 @@ distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-bin.zip
diff --git a/parquet/src/main/java/com/netflix/iceberg/parquet/ParquetWriter.java b/parquet/src/main/java/com/netflix/iceberg/parquet/ParquetWriter.java
@@ -52,7 +52,8 @@ class ParquetWriter<T> implements FileAppender<T>, Closeable {
       .hiddenImpl("org.apache.parquet.hadoop.ColumnChunkPageWriteStore",
           CodecFactory.BytesCompressor.class,
           MessageType.class,
-          ByteBufferAllocator.class)
+          ByteBufferAllocator.class,
+          int.class)
       .build();
 
   private static final DynMethods.UnboundMethod flushToWriter = DynMethods
@@ -159,7 +160,7 @@ private void startRowGroup() {
     this.recordCount = 0;
 
     PageWriteStore pageStore = pageStoreCtor.newInstance(
-        compressor, parquetSchema, props.getAllocator());
+        compressor, parquetSchema, props.getAllocator(), props.getColumnIndexTruncateLength());
 
     this.flushPageStoreToWriter = flushToWriter.bind(pageStore);
     this.writeStore = props.newColumnWriteStore(parquetSchema, pageStore);

diff --git a/pig/src/main/java/com/netflix/iceberg/pig/IcebergPigInputFormat.java b/pig/src/main/java/com/netflix/iceberg/pig/IcebergPigInputFormat.java
@@ -36,7 +36,7 @@
 import com.netflix.iceberg.types.Type;
 import com.netflix.iceberg.types.TypeUtil;
 import com.netflix.iceberg.types.Types;
-import org.apache.commons.lang.SerializationUtils;
+import org.apache.commons.lang3.SerializationUtils;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.InputSplit;

diff --git a/spark/src/main/java/com/netflix/iceberg/spark/source/IcebergSource.java b/spark/src/main/java/com/netflix/iceberg/spark/source/IcebergSource.java
@@ -20,32 +20,18 @@
 package com.netflix.iceberg.spark.source;
 
 import com.google.common.base.Preconditions;
-import com.netflix.iceberg.FileFormat;
-import com.netflix.iceberg.Schema;
 import com.netflix.iceberg.Table;
 import com.netflix.iceberg.hadoop.HadoopTables;
-import com.netflix.iceberg.spark.SparkSchemaUtil;
-import com.netflix.iceberg.types.CheckCompatibility;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.sources.DataSourceRegister;
-import org.apache.spark.sql.sources.v2.DataSourceV2;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
-import org.apache.spark.sql.sources.v2.ReadSupport;
-import org.apache.spark.sql.sources.v2.WriteSupport;
-import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
-import org.apache.spark.sql.sources.v2.writer.DataSourceWriter;
+import org.apache.spark.sql.sources.v2.TableProvider;
 import org.apache.spark.sql.types.StructType;
-import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
 
-import static com.netflix.iceberg.TableProperties.DEFAULT_FILE_FORMAT;
-import static com.netflix.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT;
-
-public class IcebergSource implements DataSourceV2, ReadSupport, WriteSupport, DataSourceRegister {
+public class IcebergSource implements TableProvider, DataSourceRegister {
 
   private SparkSession lazySpark = null;
   private Configuration lazyConf = null;
@@ -55,45 +41,6 @@ public String shortName() {
     return "iceberg";
   }
 
-  @Override
-  public DataSourceReader createReader(DataSourceOptions options) {
-    Configuration conf = new Configuration(lazyBaseConf());
-    Table table = getTableAndResolveHadoopConfiguration(options, conf);
-    return new Reader(table);
-  }
-
-  @Override
-  public Optional<DataSourceWriter> createWriter(String jobId, StructType dfStruct, SaveMode mode,
-                                                   DataSourceOptions options) {
-    Preconditions.checkArgument(mode == SaveMode.Append, "Save mode %s is not supported", mode);
-    Configuration conf = new Configuration(lazyBaseConf());
-    Table table = getTableAndResolveHadoopConfiguration(options, conf);
-
-    Schema dfSchema = SparkSchemaUtil.convert(table.schema(), dfStruct);
-    List<String> errors = CheckCompatibility.writeCompatibilityErrors(table.schema(), dfSchema);
-    if (!errors.isEmpty()) {
-      StringBuilder sb = new StringBuilder();
-      sb.append("Cannot write incompatible dataframe to table with schema:\n")
-          .append(table.schema()).append("\nProblems:");
-      for (String error : errors) {
-        sb.append("\n* ").append(error);
-      }
-      throw new IllegalArgumentException(sb.toString());
-    }
-
-    Optional<String> formatOption = options.get("iceberg.write.format");
-    FileFormat format;
-    if (formatOption.isPresent()) {
-      format = FileFormat.valueOf(formatOption.get().toUpperCase(Locale.ENGLISH));
-    } else {
-      format = FileFormat.valueOf(table.properties()
-          .getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT)
-          .toUpperCase(Locale.ENGLISH));
-    }
-
-    return Optional.of(new Writer(table, format));
-  }
-
   protected Table findTable(DataSourceOptions options, Configuration conf) {
     Optional<String> location = options.get("path");
     Preconditions.checkArgument(location.isPresent(),
@@ -136,4 +83,22 @@ private static void mergeIcebergHadoopConfs(
         .filter(key -> key.startsWith("iceberg.hadoop"))
         .forEach(key -> baseConf.set(key.replaceFirst("iceberg.hadoop", ""), options.get(key)));
   }
+
+  /**
+   * Resolves the Iceberg table described by {@code options}, using a new Configuration created
+   * from {@code lazyBaseConf()}, and wraps it in an {@link IcebergSparkTable} for Spark.
+   */
+  @Override
+  public org.apache.spark.sql.sources.v2.Table getTable(DataSourceOptions options) {
+    Configuration hadoopConf = new Configuration(lazyBaseConf());
+    return new IcebergSparkTable(getTableAndResolveHadoopConfiguration(options, hadoopConf));
+  }
+
+  /**
+   * Always fails: this source does not accept a caller-supplied schema.
+   */
+  @Override
+  public org.apache.spark.sql.sources.v2.Table getTable(DataSourceOptions options, StructType schema) {
+    throw new UnsupportedOperationException("Schema should never be passed into an iceberg table");
+  }
 }
diff --git a/spark/src/main/java/com/netflix/iceberg/spark/source/IcebergSparkTable.java b/spark/src/main/java/com/netflix/iceberg/spark/source/IcebergSparkTable.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package com.netflix.iceberg.spark.source;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.netflix.iceberg.FileFormat;
+import com.netflix.iceberg.Schema;
+import com.netflix.iceberg.Table;
+import com.netflix.iceberg.expressions.Expression;
+import com.netflix.iceberg.spark.SparkFilters;
+import com.netflix.iceberg.spark.SparkSchemaUtil;
+import com.netflix.iceberg.types.CheckCompatibility;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.sources.Filter;
+import org.apache.spark.sql.sources.v2.DataSourceOptions;
+import org.apache.spark.sql.sources.v2.SupportsBatchRead;
+import org.apache.spark.sql.sources.v2.SupportsBatchWrite;
+import org.apache.spark.sql.sources.v2.reader.Scan;
+import org.apache.spark.sql.sources.v2.reader.ScanBuilder;
+import org.apache.spark.sql.sources.v2.reader.SupportsPushDownFilters;
+import org.apache.spark.sql.sources.v2.reader.SupportsPushDownRequiredColumns;
+import org.apache.spark.sql.sources.v2.writer.BatchWrite;
+import org.apache.spark.sql.sources.v2.writer.SupportsSaveMode;
+import org.apache.spark.sql.sources.v2.writer.WriteBuilder;
+import org.apache.spark.sql.types.StructType;
+
+import java.util.List;
+import java.util.Locale;
+import java.util.Optional;
+
+import static com.netflix.iceberg.TableProperties.DEFAULT_FILE_FORMAT;
+import static com.netflix.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT;
+
+/**
+ * Spark DataSourceV2 table backed by an Iceberg {@link Table}. It hands out the scan builder used
+ * for batch reads and the write builder used for batch appends.
+ */
+public class IcebergSparkTable implements SupportsBatchRead, SupportsBatchWrite {
+
+  private static final String WRITE_FORMAT_OPTION = "iceberg.write.format";
+
+  private final Table table;
+
+  public IcebergSparkTable(Table table) {
+    this.table = table;
+  }
+
+  @Override
+  public ScanBuilder newScanBuilder(DataSourceOptions options) {
+    return new IcebergReaderBuilder(table);
+  }
+
+  /**
+   * Creates a write builder for this table. The file format is taken from the
+   * {@code iceberg.write.format} option when it is set, and from the table's default file format
+   * property otherwise; the name is upper-cased before being resolved to a {@link FileFormat}.
+   */
+  @Override
+  public WriteBuilder newWriteBuilder(DataSourceOptions options) {
+    Optional<String> requestedFormat = options.get(WRITE_FORMAT_OPTION);
+    String formatName = requestedFormat.orElseGet(() ->
+        table.properties().getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT));
+    FileFormat format = FileFormat.valueOf(formatName.toUpperCase(Locale.ENGLISH));
+    return new IcebergWriterBuilder(table, format);
+  }
+
+  /** Uses the table location as the table name. */
+  @Override
+  public String name() {
+    return table.location();
+  }
+
+  /** Returns the Iceberg schema converted to a Spark {@link StructType}. */
+  @Override
+  public StructType schema() {
+    return SparkSchemaUtil.convert(table.schema());
+  }
+
+  /**
+   * Builds the batch {@link Writer}. Only {@link SaveMode#Append} is accepted, and the incoming
+   * data schema is checked against the table schema as soon as Spark supplies it.
+   */
+  private static class IcebergWriterBuilder implements WriteBuilder, SupportsSaveMode {
+    private final Table table;
+    private final FileFormat fileFormat;
+
+    public IcebergWriterBuilder(Table table, FileFormat fileFormat) {
+      this.table = table;
+      this.fileFormat = fileFormat;
+    }
+
+    /**
+     * Rejects input whose schema cannot be written to the table, so an incompatible dataframe
+     * fails before any write is built.
+     *
+     * @throws IllegalArgumentException listing every compatibility problem found
+     */
+    @Override
+    public WriteBuilder withInputDataSchema(StructType schema) {
+      Schema dfSchema = SparkSchemaUtil.convert(table.schema(), schema);
+      List<String> errors = CheckCompatibility.writeCompatibilityErrors(table.schema(), dfSchema);
+      if (!errors.isEmpty()) {
+        StringBuilder sb = new StringBuilder();
+        sb.append("Cannot write incompatible dataframe to table with schema:\n")
+            .append(table.schema()).append("\nProblems:");
+        for (String error : errors) {
+          sb.append("\n* ").append(error);
+        }
+        throw new IllegalArgumentException(sb.toString());
+      }
+      return this;
+    }
+
+    @Override
+    public BatchWrite buildForBatch() {
+      return new Writer(table, fileFormat);
+    }
+
+    @Override
+    public WriteBuilder mode(SaveMode mode) {
+      Preconditions.checkArgument(mode == SaveMode.Append, "Save mode %s is not supported", mode);
+      return this;
+    }
+  }
+
+  /**
+   * Collects filter and column-pruning pushdown from Spark and builds the {@link Reader} scan.
+   */
+  private static class IcebergReaderBuilder implements ScanBuilder,
+      SupportsPushDownFilters,
+      SupportsPushDownRequiredColumns {
+
+    private static final Filter[] NO_FILTERS = new Filter[0];
+
+    private final Table table;
+    private Filter[] pushedFilters = NO_FILTERS;
+    private List<Expression> filterExpressions = Lists.newArrayList();
+    private StructType requestedSchema = null;
+
+    public IcebergReaderBuilder(Table table) {
+      this.table = table;
+    }
+
+    /**
+     * Records every filter that converts to an Iceberg expression; filters for which
+     * {@link SparkFilters#convert} returns null are not pushed.
+     *
+     * @return all of {@code filters}, so Spark still evaluates each one per record
+     */
+    @Override
+    public Filter[] pushFilters(Filter[] filters) {
+      List<Expression> expressions = Lists.newArrayListWithExpectedSize(filters.length);
+      List<Filter> pushed = Lists.newArrayListWithExpectedSize(filters.length);
+
+      for (Filter filter : filters) {
+        Expression expr = SparkFilters.convert(filter);
+        if (expr != null) {
+          expressions.add(expr);
+          pushed.add(filter);
+        }
+      }
+
+      this.filterExpressions = expressions;
+      this.pushedFilters = pushed.toArray(new Filter[0]);
+
+      // Spark doesn't support residuals per task, so return all filters
+      // to get Spark to handle record-level filtering
+      return filters;
+    }
+
+    @Override
+    public Filter[] pushedFilters() {
+      return pushedFilters;
+    }
+
+    @Override
+    public void pruneColumns(StructType requiredSchema) {
+      this.requestedSchema = requiredSchema;
+    }
+
+    @Override
+    public Scan build() {
+      return new Reader(table, filterExpressions, requestedSchema);
+    }
+  }
+}