-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-30662][ML][PySpark] ALS/MLP extend HasBlockSize #27389
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -54,7 +54,8 @@ import org.apache.spark.util.random.XORShiftRandom | |
| /** | ||
| * Common params for ALS and ALSModel. | ||
| */ | ||
| private[recommendation] trait ALSModelParams extends Params with HasPredictionCol { | ||
| private[recommendation] trait ALSModelParams extends Params with HasPredictionCol | ||
| with HasBlockSize { | ||
| /** | ||
| * Param for the column name for user ids. Ids must be integers. Other | ||
| * numeric types are supported for this column, but will be cast to integers as long as they | ||
|
|
@@ -125,6 +126,8 @@ private[recommendation] trait ALSModelParams extends Params with HasPredictionCo | |
|
|
||
| /** @group expertGetParam */ | ||
| def getColdStartStrategy: String = $(coldStartStrategy).toLowerCase(Locale.ROOT) | ||
|
|
||
| setDefault(blockSize -> 4096) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I just realized that I should set the default of |
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -288,6 +291,15 @@ class ALSModel private[ml] ( | |
| @Since("2.2.0") | ||
| def setColdStartStrategy(value: String): this.type = set(coldStartStrategy, value) | ||
|
|
||
| /** | ||
| * Set block size for stacking input data in matrices. | ||
| * Default is 4096. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, I think this is about to be a default of 1024, after the very latest PR from @zhengruifeng is merged. I think it's probably good to go, so I will merge it.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the comment. I actually saw the default changed to 1024 in that PR, but I want the default to be 4096; that's why I set it explicitly at line 675 in the Estimator. I want the default to be 4096 because the
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, sounds fine. |
||
| * | ||
| * @group expertSetParam | ||
| */ | ||
| @Since("3.0.0") | ||
| def setBlockSize(value: Int): this.type = set(blockSize, value) | ||
|
|
||
| private val predict = udf { (featuresA: Seq[Float], featuresB: Seq[Float]) => | ||
| if (featuresA != null && featuresB != null) { | ||
| var dotProduct = 0.0f | ||
|
|
@@ -351,7 +363,7 @@ class ALSModel private[ml] ( | |
| */ | ||
| @Since("2.2.0") | ||
| def recommendForAllUsers(numItems: Int): DataFrame = { | ||
| recommendForAll(userFactors, itemFactors, $(userCol), $(itemCol), numItems) | ||
| recommendForAll(userFactors, itemFactors, $(userCol), $(itemCol), numItems, $(blockSize)) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -366,7 +378,7 @@ class ALSModel private[ml] ( | |
| @Since("2.3.0") | ||
| def recommendForUserSubset(dataset: Dataset[_], numItems: Int): DataFrame = { | ||
| val srcFactorSubset = getSourceFactorSubset(dataset, userFactors, $(userCol)) | ||
| recommendForAll(srcFactorSubset, itemFactors, $(userCol), $(itemCol), numItems) | ||
| recommendForAll(srcFactorSubset, itemFactors, $(userCol), $(itemCol), numItems, $(blockSize)) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -377,7 +389,7 @@ class ALSModel private[ml] ( | |
| */ | ||
| @Since("2.2.0") | ||
| def recommendForAllItems(numUsers: Int): DataFrame = { | ||
| recommendForAll(itemFactors, userFactors, $(itemCol), $(userCol), numUsers) | ||
| recommendForAll(itemFactors, userFactors, $(itemCol), $(userCol), numUsers, $(blockSize)) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -392,7 +404,7 @@ class ALSModel private[ml] ( | |
| @Since("2.3.0") | ||
| def recommendForItemSubset(dataset: Dataset[_], numUsers: Int): DataFrame = { | ||
| val srcFactorSubset = getSourceFactorSubset(dataset, itemFactors, $(itemCol)) | ||
| recommendForAll(srcFactorSubset, userFactors, $(itemCol), $(userCol), numUsers) | ||
| recommendForAll(srcFactorSubset, userFactors, $(itemCol), $(userCol), numUsers, $(blockSize)) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -441,11 +453,12 @@ class ALSModel private[ml] ( | |
| dstFactors: DataFrame, | ||
| srcOutputColumn: String, | ||
| dstOutputColumn: String, | ||
| num: Int): DataFrame = { | ||
| num: Int, | ||
| blockSize: Int): DataFrame = { | ||
| import srcFactors.sparkSession.implicits._ | ||
|
|
||
| val srcFactorsBlocked = blockify(srcFactors.as[(Int, Array[Float])]) | ||
| val dstFactorsBlocked = blockify(dstFactors.as[(Int, Array[Float])]) | ||
| val srcFactorsBlocked = blockify(srcFactors.as[(Int, Array[Float])], blockSize) | ||
| val dstFactorsBlocked = blockify(dstFactors.as[(Int, Array[Float])], blockSize) | ||
| val ratings = srcFactorsBlocked.crossJoin(dstFactorsBlocked) | ||
| .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])] | ||
| .flatMap { case (srcIter, dstIter) => | ||
|
|
@@ -483,11 +496,10 @@ class ALSModel private[ml] ( | |
|
|
||
| /** | ||
| * Blockifies factors to improve the efficiency of cross join | ||
| * TODO: SPARK-20443 - expose blockSize as a param? | ||
| */ | ||
| private def blockify( | ||
| factors: Dataset[(Int, Array[Float])], | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we still need a default `blockSize` in this method? i.e.,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You are right. No need to have the default `blockSize` any more. I will update the code. Thanks! |
||
| blockSize: Int = 4096): Dataset[Seq[(Int, Array[Float])]] = { | ||
| blockSize: Int): Dataset[Seq[(Int, Array[Float])]] = { | ||
| import factors.sparkSession.implicits._ | ||
| factors.mapPartitions(_.grouped(blockSize)) | ||
| } | ||
|
|
@@ -654,6 +666,15 @@ class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel] | |
| @Since("2.2.0") | ||
| def setColdStartStrategy(value: String): this.type = set(coldStartStrategy, value) | ||
|
|
||
| /** | ||
| * Set block size for stacking input data in matrices. | ||
| * Default is 4096. | ||
| * | ||
| * @group expertSetParam | ||
| */ | ||
| @Since("3.0.0") | ||
| def setBlockSize(value: Int): this.type = set(blockSize, value) | ||
|
|
||
| /** | ||
| * Sets both numUserBlocks and numItemBlocks to the specific value. | ||
| * | ||
|
|
@@ -683,7 +704,7 @@ class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel] | |
| instr.logDataset(dataset) | ||
| instr.logParams(this, rank, numUserBlocks, numItemBlocks, implicitPrefs, alpha, userCol, | ||
| itemCol, ratingCol, predictionCol, maxIter, regParam, nonnegative, checkpointInterval, | ||
| seed, intermediateStorageLevel, finalStorageLevel) | ||
| seed, intermediateStorageLevel, finalStorageLevel, blockSize) | ||
|
|
||
| val (userFactors, itemFactors) = ALS.train(ratings, rank = $(rank), | ||
| numUserBlocks = $(numUserBlocks), numItemBlocks = $(numItemBlocks), | ||
|
|
@@ -694,7 +715,8 @@ class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel] | |
| checkpointInterval = $(checkpointInterval), seed = $(seed)) | ||
| val userDF = userFactors.toDF("id", "features") | ||
| val itemDF = itemFactors.toDF("id", "features") | ||
| val model = new ALSModel(uid, $(rank), userDF, itemDF).setParent(this) | ||
| val model = new ALSModel(uid, $(rank), userDF, itemDF).setBlockSize($(blockSize)) | ||
| .setParent(this) | ||
| copyValues(model) | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't actually go away because it's in HasBlockSize ? just checking the API doesn't change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes,
`getBlockSize` doesn't go away because it is in `HasBlockSize`. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The default value of `blockSize` in MLP is 128, so explicitly
`setDefault(blockSize -> 128)` in MLP? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is set in
`MultilayerPerceptronParams` at line 83.