apache · rdblue · Dec 11, 2018 · Nov 17, 2018 · Nov 26, 2018 · Nov 17, 2018
diff --git a/core/src/main/java/com/netflix/iceberg/TableProperties.java b/core/src/main/java/com/netflix/iceberg/TableProperties.java
@@ -66,4 +66,9 @@ public class TableProperties {
   public static final boolean OBJECT_STORE_ENABLED_DEFAULT = false;
 
   public static final String OBJECT_STORE_PATH = "write.object-storage.path";
+
+  // Location where new data files are written. This only applies to files written after this property is set;
+  // files previously written aren't relocated to reflect it.
+  // If not set, defaults to a "data" folder underneath the root path of the table.
+  public static final String WRITE_NEW_DATA_LOCATION = "write.folder-storage.path";
 }
diff --git a/spark/src/main/java/com/netflix/iceberg/spark/source/IcebergSource.java b/spark/src/main/java/com/netflix/iceberg/spark/source/IcebergSource.java
@@ -23,10 +23,12 @@
 import com.netflix.iceberg.FileFormat;
 import com.netflix.iceberg.Schema;
 import com.netflix.iceberg.Table;
+import com.netflix.iceberg.TableProperties;
 import com.netflix.iceberg.hadoop.HadoopTables;
 import com.netflix.iceberg.spark.SparkSchemaUtil;
 import com.netflix.iceberg.types.CheckCompatibility;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.sources.DataSourceRegister;
@@ -89,7 +91,11 @@ public Optional<DataSourceWriter> createWriter(String jobId, StructType dfStruct
           .toUpperCase(Locale.ENGLISH));
     }
 
-    return Optional.of(new Writer(table, lazyConf(), format));
+    String dataLocation = options.get(TableProperties.WRITE_NEW_DATA_LOCATION)
+        .orElse(table.properties().getOrDefault(
+            TableProperties.WRITE_NEW_DATA_LOCATION,
+            new Path(new Path(table.location()), "data").toString()));
+    return Optional.of(new Writer(table, lazyConf(), format, dataLocation));
   }
 
   protected Table findTable(DataSourceOptions options) {

diff --git a/spark/src/main/java/com/netflix/iceberg/spark/source/Writer.java b/spark/src/main/java/com/netflix/iceberg/spark/source/Writer.java
@@ -19,7 +19,6 @@
 
 package com.netflix.iceberg.spark.source;
 
-import com.google.common.base.Joiner;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
@@ -94,16 +93,18 @@ class Writer implements DataSourceWriter, SupportsWriteInternalRow {
   private final Table table;
   private final Configuration conf;
   private final FileFormat format;
+  private final String dataLocation;
 
-  Writer(Table table, Configuration conf, FileFormat format) {
+  Writer(Table table, Configuration conf, FileFormat format, String dataLocation) {
     this.table = table;
     this.conf = conf;
     this.format = format;
+    this.dataLocation = dataLocation; // location handed to WriterFactory for newly written data files
   }
 
   @Override
   public DataWriterFactory<InternalRow> createInternalRowWriterFactory() {
-    return new WriterFactory(table.spec(), format, dataLocation(), table.properties(), conf);
+    return new WriterFactory(table.spec(), format, dataLocation, table.properties(), conf);
   }
 
   @Override
@@ -167,10 +168,6 @@ private int propertyAsInt(String property, int defaultValue) {
     return defaultValue;
   }
 
-  private String dataLocation() {
-    return new Path(new Path(table.location()), "data").toString();
-  }
-
   @Override
   public String toString() {
     return String.format("IcebergWrite(table=%s, type=%s, format=%s)",

diff --git a/spark/src/test/java/com/netflix/iceberg/spark/source/TestDataFrameWrites.java b/spark/src/test/java/com/netflix/iceberg/spark/source/TestDataFrameWrites.java
@@ -32,21 +32,25 @@
 import com.netflix.iceberg.spark.data.AvroDataTest;
 import com.netflix.iceberg.spark.data.RandomData;
 import com.netflix.iceberg.spark.data.SparkAvroReader;
+import com.netflix.iceberg.types.Types;
 import org.apache.avro.generic.GenericData.Record;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.DataFrameWriter;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.junit.AfterClass;
 import org.junit.Assert;
 import org.junit.BeforeClass;
+import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 import java.io.File;
 import java.io.IOException;
+import java.net.URI;
 import java.util.List;
 
 import static com.netflix.iceberg.spark.SparkSchemaUtil.convert;
@@ -56,6 +60,9 @@
 @RunWith(Parameterized.class)
 public class TestDataFrameWrites extends AvroDataTest {
   private static final Configuration CONF = new Configuration();
+  private static final Schema BASIC_SCHEMA = new Schema(
+      Types.NestedField.required(0, "id", Types.LongType.get()),
+      Types.NestedField.optional(1, "data", Types.ListType.ofOptional(2, Types.StringType.get())));
 
   private String format = null;
 
@@ -91,23 +98,55 @@ public static void stopSpark() {
 
   @Override
   protected void writeAndValidate(Schema schema) throws IOException {
+    writeAndValidateWithLocations(schema, false, false); // neither data location override is set
+  }
+
+  @Test
+  public void testWrite_overridingDataLocation_tablePropertyOnly() throws IOException {
+    writeAndValidateWithLocations(BASIC_SCHEMA, true, false); // table property set, writer option not set
+  }
+
+  @Test
+  public void testWrite_overridingDataLocation_sourceOptionOnly() throws IOException {
+    writeAndValidateWithLocations(BASIC_SCHEMA, false, true); // writer option set, table property not set
+  }
+
+  @Test
+  public void testWrite_overridingDataLocation_sourceOptionTakesPrecedence() throws IOException {
+    writeAndValidateWithLocations(BASIC_SCHEMA, true, true); // both set; files are expected under the writer option
+  }
+
+  private void writeAndValidateWithLocations(
+      Schema schema,
+      boolean setTablePropertyDataLocation,
+      boolean setWriterOptionDataLocation) throws IOException {
     File parent = temp.newFolder("parquet");
     File location = new File(parent, "test");
     Assert.assertTrue("Mkdir should succeed", location.mkdirs());
 
+    File tablePropertyDataLocation = new File(parent, "test-table-property-data-dir");
+    Assert.assertTrue("Mkdir should succeed", tablePropertyDataLocation.mkdirs());
+    File writerPropertyDataLocation = new File(parent, "test-source-option-data-dir");
+    Assert.assertTrue("Mkdir should succeed", writerPropertyDataLocation.mkdirs());
+
     HadoopTables tables = new HadoopTables(CONF);
     Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString());
     Schema tableSchema = table.schema(); // use the table schema because ids are reassigned
 
     table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();
+    if (setTablePropertyDataLocation) {
+      table.updateProperties().set(
+          TableProperties.WRITE_NEW_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit();
+    }
 
     List<Record> expected = RandomData.generateList(tableSchema, 100, 0L);
     Dataset<Row> df = createDataset(expected, tableSchema);
+    DataFrameWriter<?> writer = df.write().format("iceberg").mode("append");
+    if (setWriterOptionDataLocation) {
+      writer = writer.option(TableProperties.WRITE_NEW_DATA_LOCATION, writerPropertyDataLocation.getAbsolutePath());
+    }
 
-    df.write()
-        .format("iceberg")
-        .mode("append")
-        .save(location.toString());
+    writer.save(location.toString());
 
     table.refresh();
 
@@ -121,6 +160,22 @@ protected void writeAndValidate(Schema schema) throws IOException {
     for (int i = 0; i < expected.size(); i += 1) {
       assertEqualsSafe(tableSchema.asStruct(), expected.get(i), actual.get(i));
     }
+
+    File expectedDataDir;
+    if (setWriterOptionDataLocation) {
+      expectedDataDir = writerPropertyDataLocation;
+    } else if (setTablePropertyDataLocation) {
+      expectedDataDir = tablePropertyDataLocation;
+    } else {
+      expectedDataDir = new File(location, "data");
+    }
+    table.currentSnapshot().addedFiles().forEach(dataFile ->
+        Assert.assertTrue(
+            String.format(
+                "File should have the parent directory %s, but has: %s.",
+                expectedDataDir.getAbsolutePath(),
+                dataFile.path()),
+            URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath())));
   }
 
   private Dataset<Row> createDataset(List<Record> records, Schema schema) throws IOException {

diff --git a/spark/src/test/java/com/netflix/iceberg/spark/source/TestParquetWrite.java b/spark/src/test/java/com/netflix/iceberg/spark/source/TestParquetWrite.java
@@ -23,6 +23,7 @@
 import com.netflix.iceberg.PartitionSpec;
 import com.netflix.iceberg.Schema;
 import com.netflix.iceberg.Table;
+import com.netflix.iceberg.TableProperties;
 import com.netflix.iceberg.hadoop.HadoopTables;
 import com.netflix.iceberg.types.Types;
 import org.apache.hadoop.conf.Configuration;