Commit 5ceaa8a

Davies Liu committed
Merge branch 'master' of github.com:apache/spark into binary
2 parents 1900085 + 2812815

21 files changed, +368 -125 lines changed

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 14 additions & 11 deletions
@@ -211,20 +211,11 @@ class HadoopRDD[K, V](
 
       val split = theSplit.asInstanceOf[HadoopPartition]
       logInfo("Input split: " + split.inputSplit)
-      var reader: RecordReader[K, V] = null
       val jobConf = getJobConf()
-      val inputFormat = getInputFormat(jobConf)
-      HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmm").format(createTime),
-        context.stageId, theSplit.index, context.attemptId.toInt, jobConf)
-      reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL)
-
-      // Register an on-task-completion callback to close the input stream.
-      context.addTaskCompletionListener{ context => closeIfNeeded() }
-      val key: K = reader.createKey()
-      val value: V = reader.createValue()
 
       val inputMetrics = new InputMetrics(DataReadMethod.Hadoop)
-      // Find a function that will return the FileSystem bytes read by this thread.
+      // Find a function that will return the FileSystem bytes read by this thread. Do this before
+      // creating RecordReader, because RecordReader's constructor might read some bytes
       val bytesReadCallback = if (split.inputSplit.value.isInstanceOf[FileSplit]) {
         SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(
           split.inputSplit.value.asInstanceOf[FileSplit].getPath, jobConf)
@@ -234,6 +225,18 @@ class HadoopRDD[K, V](
       if (bytesReadCallback.isDefined) {
         context.taskMetrics.inputMetrics = Some(inputMetrics)
       }
+
+      var reader: RecordReader[K, V] = null
+      val inputFormat = getInputFormat(jobConf)
+      HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmm").format(createTime),
+        context.stageId, theSplit.index, context.attemptId.toInt, jobConf)
+      reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL)
+
+      // Register an on-task-completion callback to close the input stream.
+      context.addTaskCompletionListener{ context => closeIfNeeded() }
+      val key: K = reader.createKey()
+      val value: V = reader.createValue()
+
       var recordsSinceMetricsUpdate = 0
 
       override def getNext() = {
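
The reordering in this hunk (mirrored in NewHadoopRDD.scala below) matters because a RecordReader may already consume input in its constructor; a bytes-read callback captured only after construction would miss those bytes. A minimal, self-contained sketch of that pitfall, using purely hypothetical names rather than Spark's actual classes:

object CallbackOrderingSketch {
  // Stand-in for a thread-level filesystem byte counter (e.g. FileSystem statistics).
  private var bytesReadSoFar = 0L

  // Stand-in for a RecordReader whose constructor eagerly reads a file header.
  class HeaderReadingReader {
    bytesReadSoFar += 128L                       // pretend 128 bytes are read during construction
    def readRecord(): Unit = bytesReadSoFar += 64L
  }

  def main(args: Array[String]): Unit = {
    // Capture the baseline BEFORE constructing the reader, as the patch now does.
    val baseline = bytesReadSoFar
    val bytesReadCallback: () => Long = () => bytesReadSoFar - baseline

    val reader = new HeaderReadingReader()       // header bytes are included in the delta
    reader.readRecord()
    println(s"bytes read = ${bytesReadCallback()}")  // 192: 128 (constructor) + 64 (record)
  }
}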

core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala

Lines changed: 14 additions & 12 deletions
@@ -107,20 +107,10 @@ class NewHadoopRDD[K, V](
       val split = theSplit.asInstanceOf[NewHadoopPartition]
       logInfo("Input split: " + split.serializableHadoopSplit)
       val conf = confBroadcast.value.value
-      val attemptId = newTaskAttemptID(jobTrackerId, id, isMap = true, split.index, 0)
-      val hadoopAttemptContext = newTaskAttemptContext(conf, attemptId)
-      val format = inputFormatClass.newInstance
-      format match {
-        case configurable: Configurable =>
-          configurable.setConf(conf)
-        case _ =>
-      }
-      val reader = format.createRecordReader(
-        split.serializableHadoopSplit.value, hadoopAttemptContext)
-      reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext)
 
       val inputMetrics = new InputMetrics(DataReadMethod.Hadoop)
-      // Find a function that will return the FileSystem bytes read by this thread.
+      // Find a function that will return the FileSystem bytes read by this thread. Do this before
+      // creating RecordReader, because RecordReader's constructor might read some bytes
       val bytesReadCallback = if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit]) {
         SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(
           split.serializableHadoopSplit.value.asInstanceOf[FileSplit].getPath, conf)
@@ -131,6 +121,18 @@ class NewHadoopRDD[K, V](
         context.taskMetrics.inputMetrics = Some(inputMetrics)
       }
 
+      val attemptId = newTaskAttemptID(jobTrackerId, id, isMap = true, split.index, 0)
+      val hadoopAttemptContext = newTaskAttemptContext(conf, attemptId)
+      val format = inputFormatClass.newInstance
+      format match {
+        case configurable: Configurable =>
+          configurable.setConf(conf)
+        case _ =>
+      }
+      val reader = format.createRecordReader(
+        split.serializableHadoopSplit.value, hadoopAttemptContext)
+      reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext)
+
       // Register an on-task-completion callback to close the input stream.
       context.addTaskCompletionListener(context => close())
       var havePair = false

core/src/test/scala/org/apache/spark/metrics/InputMetricsSuite.scala

Lines changed: 25 additions & 2 deletions
@@ -27,7 +27,7 @@ import scala.collection.mutable.ArrayBuffer
 import java.io.{FileWriter, PrintWriter, File}
 
 class InputMetricsSuite extends FunSuite with SharedSparkContext {
-  test("input metrics when reading text file") {
+  test("input metrics when reading text file with single split") {
     val file = new File(getClass.getSimpleName + ".txt")
     val pw = new PrintWriter(new FileWriter(file))
     pw.println("some stuff")
@@ -48,6 +48,29 @@ class InputMetricsSuite extends FunSuite with SharedSparkContext {
     // Wait for task end events to come in
     sc.listenerBus.waitUntilEmpty(500)
     assert(taskBytesRead.length == 2)
-    assert(taskBytesRead.sum == file.length())
+    assert(taskBytesRead.sum >= file.length())
+  }
+
+  test("input metrics when reading text file with multiple splits") {
+    val file = new File(getClass.getSimpleName + ".txt")
+    val pw = new PrintWriter(new FileWriter(file))
+    for (i <- 0 until 10000) {
+      pw.println("some stuff")
+    }
+    pw.close()
+    file.deleteOnExit()
+
+    val taskBytesRead = new ArrayBuffer[Long]()
+    sc.addSparkListener(new SparkListener() {
+      override def onTaskEnd(taskEnd: SparkListenerTaskEnd) {
+        taskBytesRead += taskEnd.taskMetrics.inputMetrics.get.bytesRead
+      }
+    })
+    sc.textFile("file://" + file.getAbsolutePath, 2).count()
+
+    // Wait for task end events to come in
+    sc.listenerBus.waitUntilEmpty(500)
+    assert(taskBytesRead.length == 2)
+    assert(taskBytesRead.sum >= file.length())
   }
 }
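
The assertions above now use >= rather than == because per-task byte counts can legitimately sum to more than the file length: with multiple splits, the reader for the first split reads past its boundary to finish the line it is in while the next task starts at the raw boundary, and the reader's constructor may read some bytes as well (see the HadoopRDD change above). A back-of-the-envelope sketch of that accounting, with made-up numbers rather than Spark code:

object SplitOverreadSketch {
  def main(args: Array[String]): Unit = {
    val fileLength = 1000L         // hypothetical file size in bytes
    val splitBoundary = 500L       // naive midpoint between the two splits
    val overshoot = 7L             // bytes task 1 reads past the boundary to finish its last line

    val task1Bytes = splitBoundary + overshoot    // task 1 reads through the end of that line
    val task2Bytes = fileLength - splitBoundary   // task 2 reads from the raw boundary onward

    // Bytes around the boundary are touched by both tasks, so the sum exceeds the file size.
    println(task1Bytes + task2Bytes)              // 1007
    assert(task1Bytes + task2Bytes >= fileLength) // mirrors the ">=" assertion in the test
  }
}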

docs/sql-programming-guide.md

Lines changed: 13 additions & 5 deletions
@@ -582,19 +582,27 @@ Configuration of Parquet can be done using the `setConf` method on SQLContext or
 </tr>
 <tr>
   <td><code>spark.sql.parquet.cacheMetadata</code></td>
-  <td>false</td>
+  <td>true</td>
   <td>
     Turns on caching of Parquet schema metadata. Can speed up querying of static data.
   </td>
 </tr>
 <tr>
   <td><code>spark.sql.parquet.compression.codec</code></td>
-  <td>snappy</td>
+  <td>gzip</td>
   <td>
     Sets the compression codec use when writing Parquet files. Acceptable values include:
     uncompressed, snappy, gzip, lzo.
   </td>
 </tr>
+<tr>
+  <td><code>spark.sql.hive.convertMetastoreParquet</code></td>
+  <td>true</td>
+  <td>
+    When set to false, Spark SQL will use the Hive SerDe for parquet tables instead of the built in
+    support.
+  </td>
+</tr>
 </table>
 
 ## JSON Datasets
@@ -815,15 +823,15 @@ Configuration of in-memory caching can be done using the `setConf` method on SQL
 <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
 <tr>
   <td><code>spark.sql.inMemoryColumnarStorage.compressed</code></td>
-  <td>false</td>
+  <td>true</td>
   <td>
     When set to true Spark SQL will automatically select a compression codec for each column based
     on statistics of the data.
   </td>
 </tr>
 <tr>
   <td><code>spark.sql.inMemoryColumnarStorage.batchSize</code></td>
-  <td>1000</td>
+  <td>10000</td>
   <td>
     Controls the size of batches for columnar caching. Larger batch sizes can improve memory utilization
     and compression, but risk OOMs when caching data.
@@ -841,7 +849,7 @@ that these options will be deprecated in future release as more optimizations ar
 <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
 <tr>
   <td><code>spark.sql.autoBroadcastJoinThreshold</code></td>
-  <td>10000</td>
+  <td>10485760 (10 MB)</td>
   <td>
     Configures the maximum size in bytes for a table that will be broadcast to all worker nodes when
     performing a join. By setting this value to -1 broadcasting can be disabled. Note that currently
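
For reference, a hedged usage sketch (not part of this patch) of how the documented properties are typically set from code, assuming a plain local SparkContext and Spark SQL's SQLContext.setConf API:

import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext

object SqlConfExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "sql-conf-example")
    val sqlContext = new SQLContext(sc)

    // Values are passed as strings; these mirror the defaults documented above.
    sqlContext.setConf("spark.sql.parquet.cacheMetadata", "true")
    sqlContext.setConf("spark.sql.parquet.compression.codec", "gzip")
    sqlContext.setConf("spark.sql.inMemoryColumnarStorage.batchSize", "10000")
    sqlContext.setConf("spark.sql.autoBroadcastJoinThreshold", (10 * 1024 * 1024).toString)

    sc.stop()
  }
}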

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -1359,7 +1359,7 @@
         <activeByDefault>false</activeByDefault>
       </activation>
       <properties>
-        <hive.version>0.13.1</hive.version>
+        <hive.version>0.13.1a</hive.version>
         <hive.version.short>0.13.1</hive.version.short>
         <derby.version>10.10.1.1</derby.version>
       </properties>

python/pyspark/rdd.py

Lines changed: 0 additions & 3 deletions
@@ -316,9 +316,6 @@ def sample(self, withReplacement, fraction, seed=None):
         """
         Return a sampled subset of this RDD (relies on numpy and falls back
         on default random generator if numpy is unavailable).
-
-        >>> sc.parallelize(range(0, 100)).sample(False, 0.1, 2).collect() #doctest: +SKIP
-        [2, 3, 20, 21, 24, 41, 42, 66, 67, 89, 90, 98]
         """
         assert fraction >= 0.0, "Negative fraction value: %s" % fraction
         return self.mapPartitionsWithIndex(RDDSampler(withReplacement, fraction, seed).func, True)

python/pyspark/rddsampler.py

Lines changed: 5 additions & 6 deletions
@@ -40,14 +40,13 @@ def __init__(self, withReplacement, seed=None):
     def initRandomGenerator(self, split):
         if self._use_numpy:
             import numpy
-            self._random = numpy.random.RandomState(self._seed)
+            self._random = numpy.random.RandomState(self._seed ^ split)
         else:
-            self._random = random.Random(self._seed)
+            self._random = random.Random(self._seed ^ split)
 
-        for _ in range(0, split):
-            # discard the next few values in the sequence to have a
-            # different seed for the different splits
-            self._random.randint(0, 2 ** 32 - 1)
+        # mixing because the initial seeds are close to each other
+        for _ in xrange(10):
+            self._random.randint(0, 1)
 
         self._split = split
         self._rand_initialized = True
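
The new scheme above seeds each partition's generator with seed ^ split and then discards ten cheap draws, because XOR with small split indices produces seeds that differ in only a few low bits. A hedged Scala restatement of the same idea (the patch itself is PySpark; the names here are illustrative only):

import java.util.Random

object PartitionSeedSketch {
  def rngForPartition(seed: Long, split: Int): Random = {
    val rng = new Random(seed ^ split)
    // mixing because the initial seeds are close to each other
    (0 until 10).foreach(_ => rng.nextInt(2))
    rng
  }

  def main(args: Array[String]): Unit = {
    // Draw a few samples per partition to show the streams diverge despite similar seeds.
    (0 until 4).foreach { split =>
      val rng = rngForPartition(seed = 42L, split = split)
      val draws = Seq.fill(3)(rng.nextDouble())
      println(s"split $split: " + draws.map(d => f"$d%.3f").mkString(", "))
    }
  }
}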
