From a21d8b07a130fc47e76a96935f272cb720ff1250 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 17 Aug 2017 21:45:16 -0700 Subject: [PATCH] [SPARK-21778][SQL] Simpler Dataset.sample API in Scala / Java --- .../scala/org/apache/spark/sql/Dataset.scala | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index a9887eb95279..615686ccbe2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1848,11 +1848,43 @@ class Dataset[T] private[sql]( Except(logicalPlan, other.logicalPlan) } + /** + * Returns a new [[Dataset]] by sampling a fraction of rows (without replacement), + * using a user-supplied seed. + * + * @param fraction Fraction of rows to generate, range [0.0, 1.0]. + * @param seed Seed for sampling. + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[Dataset]]. + * + * @group typedrel + * @since 2.3.0 + */ + def sample(fraction: Double, seed: Long): Dataset[T] = { + sample(withReplacement = false, fraction = fraction, seed = seed) + } + + /** + * Returns a new [[Dataset]] by sampling a fraction of rows (without replacement). + * + * @param fraction Fraction of rows to generate, range [0.0, 1.0]. + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[Dataset]]. + * + * @group typedrel + * @since 2.3.0 + */ + def sample(fraction: Double): Dataset[T] = { + sample(withReplacement = false, fraction = fraction) + } + /** * Returns a new [[Dataset]] by sampling a fraction of rows, using a user-supplied seed. * * @param withReplacement Sample with replacement or not. - * @param fraction Fraction of rows to generate. + * @param fraction Fraction of rows to generate, range [0.0, 1.0]. * @param seed Seed for sampling. * * @note This is NOT guaranteed to provide exactly the fraction of the count @@ -1871,7 +1903,7 @@ class Dataset[T] private[sql]( * Returns a new [[Dataset]] by sampling a fraction of rows, using a random seed. * * @param withReplacement Sample with replacement or not. - * @param fraction Fraction of rows to generate. + * @param fraction Fraction of rows to generate, range [0.0, 1.0]. * * @note This is NOT guaranteed to provide exactly the fraction of the total count * of the given [[Dataset]].