From 1bbd48cc4c1242195d46976d8d0382d9f09bbc25 Mon Sep 17 00:00:00 2001 From: Yunni Date: Tue, 13 Sep 2016 11:47:42 -0400 Subject: [PATCH 01/45] First Commit of LSH function implementation. Implement basic Estimator-Model class hierarchy to make RandomProjection works. --- .../scala/org/apache/spark/ml/lsh/LSH.scala | 152 ++++++++++++++++++ .../spark/ml/lsh/RandomProjection.scala | 74 +++++++++ .../spark/ml/lsh/RandomProjectionSuite.scala | 42 +++++ 3 files changed, 268 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala new file mode 100644 index 0000000000000..b0418df5caa2a --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.lsh + +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.linalg.{Vector, VectorUDT} +import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{StructField, StructType} + +/** + * Params for [[LSH]]. + */ +private[ml] trait LSHParams extends HasInputCol with HasOutputCol { + /** + * Param for output dimension. + * + * @group param + */ + final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension", + ParamValidators.gt(0)) + + /** @group getParam */ + final def getOutputDim: Int = $(outputDim) + + setDefault(outputDim -> 1) + + setDefault(outputCol -> "lsh_output") + + /** + * Transform the Schema for LSH + * @param schema The schema of the input dataset without outputCol + * @return A derived schema with outputCol added + */ + final def transformLSHSchema(schema: StructType): StructType = { + val outputFields = schema.fields :+ + StructField($(outputCol), new VectorUDT, nullable = false) + StructType(outputFields) + } +} + +/** + * Model produced by [[LSH]]. + */ +abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] + extends Model[T] with LSHParams { + override def copy(extra: ParamMap): T = defaultCopy(extra) + + protected var modelDataset: DataFrame = null + + /** + * :: DeveloperApi :: + * + * The hash function of LSH, mapping a predefined KeyType to a Vector + * @return The mapping of LSH function. + */ + protected[this] val hashFunction: KeyType => Vector + + + /** + * Transforms the input dataset. + */ + override def transform(dataset: Dataset[_]): DataFrame = { + transformSchema(dataset.schema, logging = true) + val transformUDF = udf(hashFunction, new VectorUDT) + modelDataset = dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) + modelDataset + } + + /** + * :: DeveloperApi :: + * + * Check transform validity and derive the output schema from the input schema. + * + * Typical implementation should first conduct verification on schema change and parameter + * validity, including complex parameter interaction checks. + */ + override def transformSchema(schema: StructType): StructType = { + transformLSHSchema(schema) + } + + /** + * Get the dataset inside the model. This is used in approximate similarity join or when user + * wants to run their own algorithm on the LSH dataset. + * @return The dataset inside the model + */ + def getModelDataset: Dataset[_] = modelDataset +} + +abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with LSHParams { + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + /** @group setParam */ + def setOutputDim(value: Int): this.type = set(outputDim, value) + + /** + * :: DeveloperApi :: + * + * Validate and create a new instance of concrete LSHModel. Because different LSHModel may have + * different initial setting, developer needs to define how their LSHModel is created instead of + * using reflection in this abstract class. + * @param inputDim the input dimension of input dataset + * @return A new LSHModel instance without any params + */ + protected[this] def createRawLSHModel(inputDim: Int): T + + override def copy(extra: ParamMap): Estimator[T] = defaultCopy(extra) + + /** + * Fits a model to the input data. + */ + override def fit(dataset: Dataset[_]): T = { + val inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size + val model = createRawLSHModel(inputDim).setParent(this) + copyValues(model) + model.transform(dataset) + model + } + + /** + * :: DeveloperApi :: + * + * Check transform validity and derive the output schema from the input schema. + * + * Typical implementation should first conduct verification on schema change and parameter + * validity, including complex parameter interaction checks. + */ + override def transformSchema(schema: StructType): StructType = { + transformLSHSchema(schema) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala new file mode 100644 index 0000000000000..96f7e79ee51f4 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.lsh + +import scala.util.Random + +import breeze.linalg.normalize + +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} +import org.apache.spark.ml.util.Identifiable + +/** + * Params for [[RandomProjection]]. + */ +private[ml] trait RandomProjectionParams extends Params { + val bucketLength: DoubleParam = new DoubleParam(this, "bucketLength", + "the length of each hash bucket", ParamValidators.gt(0)) +} + +class RandomProjectionModel( + override val uid: String, + val randUnitVectors: Array[breeze.linalg.Vector[Double]]) + extends LSHModel[Vector, RandomProjectionModel] with RandomProjectionParams { + + override protected[this] val hashFunction: (Vector) => Vector = { + key: Vector => { + val hashValues: Array[Double] = randUnitVectors.map({ + randUnitVector => Math.floor(key.asBreeze.dot(randUnitVector) / $(bucketLength)) + }) + Vectors.dense(hashValues) + } + } +} + +class RandomProjection(override val uid: String) extends LSH[Vector, RandomProjectionModel] + with RandomProjectionParams { + + private[this] var inputDim = -1 + + private[this] lazy val randUnitVectors: Array[breeze.linalg.Vector[Double]] = { + Array.fill($(outputDim)) { + val randArray = Array.fill(inputDim)(Random.nextGaussian()) + normalize(breeze.linalg.Vector(randArray)) + } + } + + def this() = { + this(Identifiable.randomUID("random projection")) + } + + /** @group setParam */ + def setBucketLength(value: Double): this.type = set(bucketLength, value) + + override protected[this] def createRawLSHModel(inputDim: Int): RandomProjectionModel = { + this.inputDim = inputDim + new RandomProjectionModel(uid, randUnitVectors) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala new file mode 100644 index 0000000000000..3892d22237dc3 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.lsh + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { + test("RandomProjection") { + val data = { + for (i <- -20 until 20; j <- -20 until 20) yield Vectors.dense(i.toDouble, j.toDouble) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + // Project from 2 dimensional Euclidean Space to 10 dimensions + val rp = new RandomProjection() + .setOutputDim(10) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(3.0) + + val model = rp.fit(df) + + model.getModelDataset.show() + } +} From ca46d82214a3ebc38c0bc69a460f6cfcb6550d99 Mon Sep 17 00:00:00 2001 From: Yunni Date: Tue, 13 Sep 2016 12:09:03 -0400 Subject: [PATCH 02/45] Implementation of Approximate Nearest Neighbors. Add distCol as another model parameters --- .../scala/org/apache/spark/ml/lsh/LSH.scala | 71 ++++++++++++++++++- .../spark/ml/lsh/RandomProjection.scala | 5 ++ .../spark/ml/lsh/RandomProjectionSuite.scala | 1 + 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala index b0418df5caa2a..20772919101e9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala @@ -19,11 +19,11 @@ package org.apache.spark.ml.lsh import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, VectorUDT} -import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} +import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.types.{DataTypes, StructField, StructType} /** * Params for [[LSH]]. @@ -37,13 +37,25 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension", ParamValidators.gt(0)) + /** + * Param for distance column name. + * + * @group param + */ + final val distCol: Param[String] = new Param[String](this, "distCol", "distance column name") + /** @group getParam */ final def getOutputDim: Int = $(outputDim) + /** @group getParam */ + final def getDistCol: String = $(distCol) + setDefault(outputDim -> 1) setDefault(outputCol -> "lsh_output") + setDefault(distCol -> "lsh_distance") + /** * Transform the Schema for LSH * @param schema The schema of the input dataset without outputCol @@ -73,6 +85,30 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] */ protected[this] val hashFunction: KeyType => Vector + /** + * :: DeveloperApi :: + * + * Calculate the distance between two different keys using the distance metric corresponding + * to the hashFunction + * @param x One of the point in the metric space + * @param y Another the point in the metric space + * @return The distance between x and y in double + */ + protected[this] def keyDistance(x: KeyType, y: KeyType): Double + + /** + * :: DeveloperApi :: + * + * Calculate the distance between two different hash Vectors. By default, the distance is the + * minimum distance of two hash values in any dimension. + * + * @param x One of the hash vector + * @param y Another hash vector + * @return The distance between hash vectors x and y in double + */ + protected[this] def hashDistance(x: Vector, y: Vector): Double = { + (x.asBreeze - y.asBreeze).toArray.map(math.abs).min + } /** * Transforms the input dataset. @@ -102,6 +138,34 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * @return The dataset inside the model */ def getModelDataset: Dataset[_] = modelDataset + + /** + * Given a large dataset and an item, approximately find at most k items which have the closest + * distance to the item. + * @param key The key to hash for the item + * @param k The maximum number of items closest to the key + * @return A dataset containing at most k items closest to the key. + */ + def approxNearestNeighbors(key: KeyType, k: Int = 1): Dataset[_] = { + if (k < 1) { + throw new Exception(s"Invalid number of nearest neighbors $k") + } + // Get Hash Value of the key v + val keyHash = hashFunction(key) + + // In the origin dataset, find the hash value u that is closest to v + val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType) + val nearestHashDataset = modelDataset.select(min(hashDistUDF(col($(outputCol))))) + val nearestHashValue = nearestHashDataset.collect()(0)(0).asInstanceOf[Double] + + // Filter the dataset where the hash value equals to u + val modelSubset = modelDataset.filter(hashDistUDF(col($(outputCol))) === nearestHashValue) + + // Get the top k nearest neighbor by their distance to the key + val keyDistUDF = udf((x: KeyType) => keyDistance(x, key), DataTypes.DoubleType) + val modelSubsetWithDistCol = modelSubset.withColumn($(distCol), keyDistUDF(col($(inputCol)))) + modelSubsetWithDistCol.sort($(distCol)).limit(k) + } } abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with LSHParams { @@ -114,6 +178,9 @@ abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with /** @group setParam */ def setOutputDim(value: Int): this.type = set(outputDim, value) + /** @group setParam */ + def setDistCol(value: String): this.type = set(distCol, value) + /** * :: DeveloperApi :: * diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala index 96f7e79ee51f4..67c4084cb3f84 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.lsh import scala.util.Random +import breeze.linalg.functions.euclideanDistance import breeze.linalg.normalize import org.apache.spark.ml.linalg.{Vector, Vectors} @@ -46,6 +47,10 @@ class RandomProjectionModel( Vectors.dense(hashValues) } } + + override protected[this] def keyDistance(x: Vector, y: Vector): Double = { + euclideanDistance(x.asBreeze, y.asBreeze) + } } class RandomProjection(override val uid: String) extends LSH[Vector, RandomProjectionModel] diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala index 3892d22237dc3..d417c41c44838 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala @@ -38,5 +38,6 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { val model = rp.fit(df) model.getModelDataset.show() + model.approxNearestNeighbors(Vectors.dense(1.2, 3.4), k = 20).show() } } From c693f5b2deec621bf8dbf617d1fb2367bf8b3397 Mon Sep 17 00:00:00 2001 From: Yunni Date: Thu, 15 Sep 2016 01:48:35 -0400 Subject: [PATCH 03/45] Implement approxSimilarityJoin(). Remove modelDataset and distCol as discussed in the Design Doc. --- .../scala/org/apache/spark/ml/lsh/LSH.scala | 120 +++++++++++++----- .../spark/ml/lsh/RandomProjectionSuite.scala | 5 +- 2 files changed, 88 insertions(+), 37 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala index 20772919101e9..fb19627294b93 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala @@ -17,13 +17,16 @@ package org.apache.spark.ml.lsh +import scala.util.Random + import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, VectorUDT} -import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.sql._ +import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DataTypes, StructField, StructType} +import org.apache.spark.sql.types._ /** * Params for [[LSH]]. @@ -37,25 +40,13 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension", ParamValidators.gt(0)) - /** - * Param for distance column name. - * - * @group param - */ - final val distCol: Param[String] = new Param[String](this, "distCol", "distance column name") - /** @group getParam */ final def getOutputDim: Int = $(outputDim) - /** @group getParam */ - final def getDistCol: String = $(distCol) - setDefault(outputDim -> 1) setDefault(outputCol -> "lsh_output") - setDefault(distCol -> "lsh_distance") - /** * Transform the Schema for LSH * @param schema The schema of the input dataset without outputCol @@ -74,9 +65,6 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] extends Model[T] with LSHParams { override def copy(extra: ParamMap): T = defaultCopy(extra) - - protected var modelDataset: DataFrame = null - /** * :: DeveloperApi :: * @@ -116,8 +104,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val transformUDF = udf(hashFunction, new VectorUDT) - modelDataset = dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) - modelDataset + dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) } /** @@ -132,26 +119,23 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] transformLSHSchema(schema) } - /** - * Get the dataset inside the model. This is used in approximate similarity join or when user - * wants to run their own algorithm on the LSH dataset. - * @return The dataset inside the model - */ - def getModelDataset: Dataset[_] = modelDataset - /** * Given a large dataset and an item, approximately find at most k items which have the closest * distance to the item. * @param key The key to hash for the item * @param k The maximum number of items closest to the key - * @return A dataset containing at most k items closest to the key. + * @param distCol The column to store the distance between pairs + * @return A dataset containing at most k items closest to the key. A distCol is added to show + * the distance between each record and the key. */ - def approxNearestNeighbors(key: KeyType, k: Int = 1): Dataset[_] = { + def approxNearestNeighbors(dataset: Dataset[_], key: KeyType, k: Int = 1, + distCol: String = "distance"): Dataset[_] = { if (k < 1) { throw new Exception(s"Invalid number of nearest neighbors $k") } // Get Hash Value of the key v val keyHash = hashFunction(key) + val modelDataset = transform(dataset) // In the origin dataset, find the hash value u that is closest to v val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType) @@ -163,8 +147,79 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] // Get the top k nearest neighbor by their distance to the key val keyDistUDF = udf((x: KeyType) => keyDistance(x, key), DataTypes.DoubleType) - val modelSubsetWithDistCol = modelSubset.withColumn($(distCol), keyDistUDF(col($(inputCol)))) - modelSubsetWithDistCol.sort($(distCol)).limit(k) + val modelSubsetWithDistCol = modelSubset.withColumn(distCol, keyDistUDF(col($(inputCol)))) + modelSubsetWithDistCol.sort(distCol).limit(k) + } + + /** + * Preprocess step for approximate similarity join. Transform and explode the outputCol to + * explodeCols. + * @param dataset The dataset to transform and explode. + * @param explodeCols The alias for the exploded columns, must be a seq of two strings. + * @return A dataset containing idCol, inputCol and explodeCols + */ + private[this] def processDataset(dataset: Dataset[_], explodeCols: Seq[String]): Dataset[_] = { + if (explodeCols.size != 2) { + throw new Exception("explodeCols must be two strings.") + } + val vectorToMap: UserDefinedFunction = udf((x: Vector) => x.asBreeze.iterator.toMap, + MapType(DataTypes.IntegerType, DataTypes.DoubleType)) + transform(dataset) + .select(col("*"), explode(vectorToMap(col($(outputCol)))).as(explodeCols)) + } + + /** + * Recreate a column using the same column name but different attribute id. Used in approximate + * similarity join. + * @param dataset The dataset where a column need to recreate + * @param colName The name of the column to recreate + * @param tmpColName A temporary column name which does not conflict with existing columns + * @return + */ + private[this] def recreateCol(dataset: Dataset[_], colName: String, + tmpColName: String): Dataset[_] = { + dataset + .withColumnRenamed(colName, tmpColName) + .withColumn(colName, col(tmpColName)) + .drop(tmpColName) + } + + /** + * Join two dataset to approximately find all pairs of records whose distance are smaller + * than the threshold. + * @param datasetA One of the datasets to join + * @param datasetB Another dataset to join + * @param threshold The threshold for the distance of record pairs + * @param distCol The column to store the distance between pairs + * @return A joined dataset containing pairs of records. A distCol is added to show the distance + * between each pair of records. + */ + def approxSimilarityJoin(datasetA: Dataset[_], datasetB: Dataset[_], threshold: Double, + distCol: String = "distance"): Dataset[_] = { + + val explodeCols = Seq("lsh#entry", "lsh#hashValue") + val explodedA = processDataset(datasetA, explodeCols) + + // If this is a self join, we need to recreate the inputCol of datasetB to avoid ambiguity. + val explodedB = if (datasetA != datasetB) { + processDataset(datasetB, explodeCols) + } else { + val recreatedB = recreateCol(datasetB, $(inputCol), s"${$(inputCol)}#${Random.nextString(5)}") + processDataset(recreatedB, explodeCols) + } + + // Do a hash join on where the exploded hash values are equal. + val joinedDataset = explodedA.join(explodedB, explodeCols) + .drop(explodeCols: _*) + + // Add a new column to store the distance of the two records. + val distUDF = udf((x: KeyType, y: KeyType) => keyDistance(x, y), DataTypes.DoubleType) + val joinedDatasetWithDist = joinedDataset.select(col("*"), + distUDF(explodedA($(inputCol)), explodedB($(inputCol))).as(distCol) + ) + + // Filter the joined datasets where the distance are smaller than the threshold. + joinedDatasetWithDist.distinct().filter(col(distCol) < threshold) } } @@ -178,9 +233,6 @@ abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with /** @group setParam */ def setOutputDim(value: Int): this.type = set(outputDim, value) - /** @group setParam */ - def setDistCol(value: String): this.type = set(distCol, value) - /** * :: DeveloperApi :: * @@ -201,8 +253,6 @@ abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with val inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size val model = createRawLSHModel(inputDim).setParent(this) copyValues(model) - model.transform(dataset) - model } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala index d417c41c44838..4c2071a685e44 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala @@ -37,7 +37,8 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { val model = rp.fit(df) - model.getModelDataset.show() - model.approxNearestNeighbors(Vectors.dense(1.2, 3.4), k = 20).show() + model.transform(df).show() + model.approxNearestNeighbors(df, Vectors.dense(1.2, 3.4), k = 20).show() + model.approxSimilarityJoin(df, df, 1.1).filter("distance != 0.0").show() } } From c9ee0f9222f76ee2bc77e1a0e056274444a4af5e Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 19 Sep 2016 00:10:10 -0400 Subject: [PATCH 04/45] Add test utility method to check LSH property. Tested on random projection. --- .../scala/org/apache/spark/ml/lsh/LSH.scala | 4 +- .../org/apache/spark/ml/lsh/LSHTest.scala | 69 +++++++++++++++++++ .../spark/ml/lsh/RandomProjectionSuite.scala | 34 ++++++--- 3 files changed, 97 insertions(+), 10 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala index fb19627294b93..786c8e5817e34 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala @@ -82,7 +82,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * @param y Another the point in the metric space * @return The distance between x and y in double */ - protected[this] def keyDistance(x: KeyType, y: KeyType): Double + protected[ml] def keyDistance(x: KeyType, y: KeyType): Double /** * :: DeveloperApi :: @@ -94,7 +94,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * @param y Another hash vector * @return The distance between hash vectors x and y in double */ - protected[this] def hashDistance(x: Vector, y: Vector): Double = { + protected[ml] def hashDistance(x: Vector, y: Vector): Double = { (x.asBreeze - y.asBreeze).toArray.map(math.abs).min } diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala new file mode 100644 index 0000000000000..d94b7d4ed4848 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.lsh + +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.DataTypes + +private[ml] object LSHTest { + /** + * For any locality sensitive function h in a metric space, we meed to verify whether + * the following property is satisfied. + * + * There exist d1, d2, p1, p2, so that for any two elements e1 and e2, + * If dist(e1, e2) >= dist1, then Pr{h(x) == h(y)} >= p1 + * If dist(e1, e2) <= dist2, then Pr{h(x) != h(y)} <= p2 + * + * This is called locality sensitive property. This method checks the property on an + * existing dataset and calculate the probabilities. + * (https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Definition) + * + * @param dataset The dataset to verify the locality sensitive hashing property. + * @param lsh The lsh instance to perform the hashing + * @param dist1 Distance threshold for false positive + * @param dist2 Distance threshold for false negative + * @tparam KeyType The input key type of LSH + * @tparam T The type of lsh instance + * @return A tuple of two doubles, representing the false positive and false negative rate + */ + def checkLSHProperty[KeyType, T <: LSHModel[KeyType, T]] + (dataset: Dataset[_], lsh: LSH[KeyType, T], dist1: Double, dist2: Double): (Double, Double) = { + val model = lsh.fit(dataset) + val inputCol = model.getInputCol + val outputCol = model.getOutputCol + val transformedData = model.transform(dataset) + + // Perform a cross join and label each pair of same_bucket and distance + val pairs = transformedData.as("a").crossJoin(transformedData.as("b")) + val distUDF = udf((x: KeyType, y: KeyType) => model.keyDistance(x, y), DataTypes.DoubleType) + val sameBucket = udf((x: Vector, y: Vector) => model.hashDistance(x, y) == 0.0, + DataTypes.BooleanType) + val result = pairs + .withColumn("same_bucket", sameBucket(col(s"a.$outputCol"), col(s"b.$outputCol"))) + .withColumn("distance", distUDF(col(s"a.$inputCol"), col(s"b.$inputCol"))) + + // Compute the probabilities based on the join result + val positive = result.filter(col("same_bucket")) + val negative = result.filter(!col("same_bucket")) + val falsePositiveCount = positive.filter(col("distance") > dist1).count().toDouble + val falseNegativeCount = negative.filter(col("distance") < dist2).count().toDouble + (falsePositiveCount / positive.count(), falseNegativeCount / negative.count()) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala index 4c2071a685e44..e4b2e1eae7715 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala @@ -24,21 +24,39 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { test("RandomProjection") { val data = { - for (i <- -20 until 20; j <- -20 until 20) yield Vectors.dense(i.toDouble, j.toDouble) + for (i <- -5 until 5; j <- -5 until 5) yield Vectors.dense(i.toDouble, j.toDouble) } val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") - // Project from 2 dimensional Euclidean Space to 10 dimensions + // Project from 2 dimensional Euclidean Space to 1 dimensions val rp = new RandomProjection() - .setOutputDim(10) + .setOutputDim(1) .setInputCol("keys") .setOutputCol("values") - .setBucketLength(3.0) + .setBucketLength(1.0) + + val (falsePositive, falseNegative) = LSHTest.checkLSHProperty(df, rp, 8.0, 2.0) + assert(falsePositive < 0.1) + assert(falseNegative < 0.1) + } - val model = rp.fit(df) + test("RandomProjection with high dimension data") { + val numDim = 100 + val data = { + for (i <- 0 until numDim; j <- Seq(-2, -1, 1, 2)) + yield Vectors.sparse(numDim, Seq((i, j.toDouble))) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + // Project from 100 dimensional Euclidean Space to 10 dimensions + val rp = new RandomProjection() + .setOutputDim(10) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(2.5) - model.transform(df).show() - model.approxNearestNeighbors(df, Vectors.dense(1.2, 3.4), k = 20).show() - model.approxSimilarityJoin(df, df, 1.1).filter("distance != 0.0").show() + val (falsePositive, falseNegative) = LSHTest.checkLSHProperty(df, rp, 3.0, 2.0) + assert(falsePositive < 0.1) + assert(falseNegative < 0.1) } } From fc838e0de0fd560a69b4a60bec5411c00842b4bb Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 19 Sep 2016 00:55:39 -0400 Subject: [PATCH 05/45] Add testing utility for approximate nearest neighbor. Run the testing on random projection. --- .../scala/org/apache/spark/ml/lsh/LSH.scala | 1 + .../org/apache/spark/ml/lsh/LSHTest.scala | 32 +++++++++++++++++-- .../spark/ml/lsh/RandomProjectionSuite.scala | 18 +++++++++++ 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala index 786c8e5817e34..15b22e534dda5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala @@ -122,6 +122,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] /** * Given a large dataset and an item, approximately find at most k items which have the closest * distance to the item. + * @param dataset the dataset to look for the key * @param key The key to hash for the item * @param k The maximum number of items closest to the key * @param distCol The column to store the distance between pairs diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala index d94b7d4ed4848..108e9b22cb1dd 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala @@ -35,12 +35,12 @@ private[ml] object LSHTest { * existing dataset and calculate the probabilities. * (https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Definition) * - * @param dataset The dataset to verify the locality sensitive hashing property. + * @param dataset The dataset to verify the locality sensitive hashing property. * @param lsh The lsh instance to perform the hashing * @param dist1 Distance threshold for false positive * @param dist2 Distance threshold for false negative - * @tparam KeyType The input key type of LSH - * @tparam T The type of lsh instance + * @tparam KeyType The input key type of LSH + * @tparam T The class type of lsh * @return A tuple of two doubles, representing the false positive and false negative rate */ def checkLSHProperty[KeyType, T <: LSHModel[KeyType, T]] @@ -66,4 +66,30 @@ private[ml] object LSHTest { val falseNegativeCount = negative.filter(col("distance") < dist2).count().toDouble (falsePositiveCount / positive.count(), falseNegativeCount / negative.count()) } + + /** + * Check and compute the precision and recall of approximate nearest neighbors + * @param lsh The lsh instance + * @param dataset the dataset to look for the key + * @param key The key to hash for the item + * @param k The maximum number of items closest to the key + * @tparam KeyType The input key type of LSH + * @tparam T The class type of lsh + * @return A tuple of two doubles, representing precision and recall rate + */ + def checkApproxNearestNeighbors[KeyType, T <: LSHModel[KeyType, T]] + (lsh: LSH[KeyType, T], dataset: Dataset[_], key: KeyType, k: Int): (Double, Double) = { + val model = lsh.fit(dataset) + + // Compute expected + val distUDF = udf((x: KeyType) => model.keyDistance(x, key), DataTypes.DoubleType) + val expected = dataset.sort(distUDF(col(model.getInputCol))).limit(k) + + // Compute actual + val actual = model.approxNearestNeighbors(dataset, key, k) + + // Compute precision and recall + val correctCount = expected.join(actual, model.getInputCol).count().toDouble + (correctCount / expected.count(), correctCount / actual.count()) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala index e4b2e1eae7715..2627b5f2932d9 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala @@ -59,4 +59,22 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(falsePositive < 0.1) assert(falseNegative < 0.1) } + + test("approxNearestNeighbors for random projection") { + val data = { + for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + val key = Vectors.dense(1.2, 3.4) + + val rp = new RandomProjection() + .setOutputDim(2) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(4.0) + + val (precision, recall) = LSHTest.checkApproxNearestNeighbors(rp, df, key, 10) + assert(precision >= 0.7) + assert(recall >= 0.7) + } } From aa138e8db4fab8c6cd33d465895b65c8519c88b9 Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 19 Sep 2016 02:14:37 -0400 Subject: [PATCH 06/45] Add testing utility for approximate similarity join. Run the testing on random projection. --- .../org/apache/spark/ml/lsh/LSHTest.scala | 29 +++++++++++++ .../spark/ml/lsh/RandomProjectionSuite.scala | 42 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala index 108e9b22cb1dd..d36e12692fa6f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala @@ -92,4 +92,33 @@ private[ml] object LSHTest { val correctCount = expected.join(actual, model.getInputCol).count().toDouble (correctCount / expected.count(), correctCount / actual.count()) } + + /** + * Check and compute the precision and recall of approximate similarity join + * @param lsh The lsh instance + * @param datasetA One of the datasets to join + * @param datasetB Another dataset to join + * @param threshold The threshold for the distance of record pairs + * @tparam KeyType The input key type of LSH + * @tparam T The class type of lsh + * @return A tuple of two doubles, representing precision and recall rate + */ + def checkApproxSimilarityJoin[KeyType, T <: LSHModel[KeyType, T]] + (lsh: LSH[KeyType, T], datasetA: Dataset[_], datasetB: Dataset[_], + threshold: Double): (Double, Double) = { + val model = lsh.fit(datasetA) + val inputCol = model.getInputCol + + // Compute expected + val distUDF = udf((x: KeyType, y: KeyType) => model.keyDistance(x, y), DataTypes.DoubleType) + val expected = datasetA.as("a").crossJoin(datasetB.as("b")) + .filter(distUDF(col(s"a.$inputCol"), col(s"b.$inputCol")) < threshold) + + // Compute actual + val actual = model.approxSimilarityJoin(datasetA, datasetB, threshold) + + // Compute precision and recall + val correctCount = actual.filter(col("distance") < threshold).count().toDouble + (correctCount / actual.count(), correctCount / expected.count()) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala index 2627b5f2932d9..cc3611a62105a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala @@ -17,6 +17,9 @@ package org.apache.spark.ml.lsh +import breeze.numerics.{cos, sin} +import breeze.numerics.constants.Pi + import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -77,4 +80,43 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(precision >= 0.7) assert(recall >= 0.7) } + + test("approxSimilarityJoin for random projection on different dataset") { + val dataA = { + for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) + } + val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") + + val dataB = { + for (i <- 0 until 24) yield Vectors.dense(10 * sin(Pi / 12 * i), 10 * cos(Pi / 12 * i)) + } + val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") + + val rp = new RandomProjection() + .setOutputDim(2) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(4.0) + + val (precision, recall) = LSHTest.checkApproxSimilarityJoin(rp, dfA, dfB, 1.0) + assert(precision == 1.0) + assert(recall >= 0.9) + } + + test("approxSimilarityJoin for self join") { + val data = { + for (i <- 0 until 24) yield Vectors.dense(10 * sin(Pi / 12 * i), 10 * cos(Pi / 12 * i)) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + val rp = new RandomProjection() + .setOutputDim(2) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(4.0) + + val (precision, recall) = LSHTest.checkApproxSimilarityJoin(rp, df, df, 3.0) + assert(precision == 1.0) + assert(recall >= 0.7) + } } From bbcbcf0a757bd15fe0e9f4bc182d35308737c320 Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Mon, 19 Sep 2016 15:31:49 -0700 Subject: [PATCH 07/45] Code review comments. A new unit test of k nearest neighbor for large k --- .../main/scala/org/apache/spark/ml/lsh/LSH.scala | 7 +++---- .../apache/spark/ml/lsh/RandomProjection.scala | 15 +++++++-------- .../scala/org/apache/spark/ml/lsh/LSHTest.scala | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala index 15b22e534dda5..b7e5ac44cefdb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala @@ -131,9 +131,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] */ def approxNearestNeighbors(dataset: Dataset[_], key: KeyType, k: Int = 1, distCol: String = "distance"): Dataset[_] = { - if (k < 1) { - throw new Exception(s"Invalid number of nearest neighbors $k") - } + assert(k > 0, "The number of nearest neighbors cannot be less than 1") // Get Hash Value of the key v val keyHash = hashFunction(key) val modelDataset = transform(dataset) @@ -202,6 +200,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] val explodedA = processDataset(datasetA, explodeCols) // If this is a self join, we need to recreate the inputCol of datasetB to avoid ambiguity. + // TODO: Remove recreateCol logic once SPARK-17154 is resolved. val explodedB = if (datasetA != datasetB) { processDataset(datasetB, explodeCols) } else { @@ -220,7 +219,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] ) // Filter the joined datasets where the distance are smaller than the threshold. - joinedDatasetWithDist.distinct().filter(col(distCol) < threshold) + joinedDatasetWithDist.filter(col(distCol) < threshold).distinct() } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala index 67c4084cb3f84..e77a1a87b34c6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala @@ -19,10 +19,9 @@ package org.apache.spark.ml.lsh import scala.util.Random -import breeze.linalg.functions.euclideanDistance import breeze.linalg.normalize -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} import org.apache.spark.ml.util.Identifiable @@ -36,20 +35,20 @@ private[ml] trait RandomProjectionParams extends Params { class RandomProjectionModel( override val uid: String, - val randUnitVectors: Array[breeze.linalg.Vector[Double]]) + val randUnitVectors: Array[Vector]) extends LSHModel[Vector, RandomProjectionModel] with RandomProjectionParams { override protected[this] val hashFunction: (Vector) => Vector = { key: Vector => { val hashValues: Array[Double] = randUnitVectors.map({ - randUnitVector => Math.floor(key.asBreeze.dot(randUnitVector) / $(bucketLength)) + randUnitVector => Math.floor(BLAS.dot(key, randUnitVector) / $(bucketLength)) }) Vectors.dense(hashValues) } } - override protected[this] def keyDistance(x: Vector, y: Vector): Double = { - euclideanDistance(x.asBreeze, y.asBreeze) + override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { + Math.sqrt(Vectors.sqdist(x, y)) } } @@ -58,10 +57,10 @@ class RandomProjection(override val uid: String) extends LSH[Vector, RandomProje private[this] var inputDim = -1 - private[this] lazy val randUnitVectors: Array[breeze.linalg.Vector[Double]] = { + private[this] lazy val randUnitVectors: Array[Vector] = { Array.fill($(outputDim)) { val randArray = Array.fill(inputDim)(Random.nextGaussian()) - normalize(breeze.linalg.Vector(randArray)) + Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray))) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala index d36e12692fa6f..7bd6c373615e2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala @@ -90,7 +90,7 @@ private[ml] object LSHTest { // Compute precision and recall val correctCount = expected.join(actual, model.getInputCol).count().toDouble - (correctCount / expected.count(), correctCount / actual.count()) + (correctCount / actual.count(), correctCount / expected.count()) } /** From d3891597ffc62954e32f8a34ae0c3a54c1fef94a Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Mon, 19 Sep 2016 15:38:08 -0700 Subject: [PATCH 08/45] Code review comments. A new unit test of k nearest neighbor for large k --- .../scala/org/apache/spark/ml/lsh/LSH.scala | 9 ++++++--- .../spark/ml/lsh/RandomProjectionSuite.scala | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala index b7e5ac44cefdb..d80a136b15d43 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala @@ -138,11 +138,14 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] // In the origin dataset, find the hash value u that is closest to v val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType) - val nearestHashDataset = modelDataset.select(min(hashDistUDF(col($(outputCol))))) - val nearestHashValue = nearestHashDataset.collect()(0)(0).asInstanceOf[Double] + + // Compute threshold to get exact k elements. + val modelDatasetSortedByHash = modelDataset.sort(hashDistUDF(col($(outputCol)))).limit(k) + val thresholdDataset = modelDatasetSortedByHash.select(max(hashDistUDF(col($(outputCol))))) + val hashThreshold = thresholdDataset.collect()(0)(0).asInstanceOf[Double] // Filter the dataset where the hash value equals to u - val modelSubset = modelDataset.filter(hashDistUDF(col($(outputCol))) === nearestHashValue) + val modelSubset = modelDataset.filter(hashDistUDF(col($(outputCol))) <= hashThreshold) // Get the top k nearest neighbor by their distance to the key val keyDistUDF = udf((x: KeyType) => keyDistance(x, key), DataTypes.DoubleType) diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala index cc3611a62105a..b92548c49bd2d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala @@ -81,6 +81,24 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(recall >= 0.7) } + test("approxNearestNeighbors for small bucket and large k") { + val data = { + for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + val key = Vectors.dense(1.2, 3.4) + + val rp = new RandomProjection() + .setOutputDim(20) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(1.0) + + val (precision, recall) = LSHTest.checkApproxNearestNeighbors(rp, df, key, 100) + assert(precision >= 0.7) + assert(recall >= 0.7) + } + test("approxSimilarityJoin for random projection on different dataset") { val dataA = { for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) From 19d012a7f2c4fcc3bc7149944c30babe78cb4ea7 Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Mon, 19 Sep 2016 16:19:31 -0700 Subject: [PATCH 09/45] (1) Refactor hashDistCol for nearest neighbor search. (2) Add scaladoc for LSH along with reference papers --- .../scala/org/apache/spark/ml/lsh/LSH.scala | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala index d80a136b15d43..1611c650353ca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala @@ -138,14 +138,15 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] // In the origin dataset, find the hash value u that is closest to v val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType) + val hashDistCol = hashDistUDF(col($(outputCol))) // Compute threshold to get exact k elements. - val modelDatasetSortedByHash = modelDataset.sort(hashDistUDF(col($(outputCol)))).limit(k) - val thresholdDataset = modelDatasetSortedByHash.select(max(hashDistUDF(col($(outputCol))))) + val modelDatasetSortedByHash = modelDataset.sort(hashDistCol).limit(k) + val thresholdDataset = modelDatasetSortedByHash.select(max(hashDistCol)) val hashThreshold = thresholdDataset.collect()(0)(0).asInstanceOf[Double] - // Filter the dataset where the hash value equals to u - val modelSubset = modelDataset.filter(hashDistUDF(col($(outputCol))) <= hashThreshold) + // Filter the dataset where the hash value is less than the threshold. + val modelSubset = modelDataset.filter(hashDistCol <= hashThreshold) // Get the top k nearest neighbor by their distance to the key val keyDistUDF = udf((x: KeyType) => keyDistance(x, key), DataTypes.DoubleType) @@ -226,6 +227,22 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] } } +/** + * Locality Sensitive Hashing for different metrics space. Support basic transformation with a new + * hash column, approximate nearest neighbor search with a dataset and a key, and approximate + * similarity join of two datasets. + * + * Currently the following LSH family is implemented: + * - Euclidean Distance: Random Projection + * + * References: + * (1) Gionis, Aristides, Piotr Indyk, and Rajeev Motwani. "Similarity search in high dimensions + * via hashing." VLDB 7 Sep. 1999: 518-529. + * (2) Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint + * arXiv:1408.2927 (2014). + * @tparam KeyType The input key type of LSH + * @tparam T The class type of lsh + */ abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with LSHParams { /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) From 269c8c91dfbc20d84a4e2e658a910b5adc68314c Mon Sep 17 00:00:00 2001 From: Yunni Date: Tue, 20 Sep 2016 11:31:15 -0400 Subject: [PATCH 10/45] Code Review comments: (1) Rewrite hashDistance (2) Move the lsh package to be under feature --- .../scala/org/apache/spark/ml/{ => feature}/lsh/LSH.scala | 5 +++-- .../apache/spark/ml/{ => feature}/lsh/RandomProjection.scala | 2 +- .../org/apache/spark/ml/{ => feature}/lsh/LSHTest.scala | 2 +- .../spark/ml/{ => feature}/lsh/RandomProjectionSuite.scala | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) rename mllib/src/main/scala/org/apache/spark/ml/{ => feature}/lsh/LSH.scala (98%) rename mllib/src/main/scala/org/apache/spark/ml/{ => feature}/lsh/RandomProjection.scala (98%) rename mllib/src/test/scala/org/apache/spark/ml/{ => feature}/lsh/LSHTest.scala (99%) rename mllib/src/test/scala/org/apache/spark/ml/{ => feature}/lsh/RandomProjectionSuite.scala (99%) diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala similarity index 98% rename from mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala rename to mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala index 1611c650353ca..41315b28d2731 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.ml.lsh +package org.apache.spark.ml.feature.lsh import scala.util.Random @@ -95,7 +95,8 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * @return The distance between hash vectors x and y in double */ protected[ml] def hashDistance(x: Vector, y: Vector): Double = { - (x.asBreeze - y.asBreeze).toArray.map(math.abs).min + // Since it's generated by hashing, it will be a pair of dense vectors. + x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala similarity index 98% rename from mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala rename to mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala index e77a1a87b34c6..5a19a21ff913f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/lsh/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.ml.lsh +package org.apache.spark.ml.feature.lsh import scala.util.Random diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala similarity index 99% rename from mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala rename to mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala index 7bd6c373615e2..83ff49b19b61c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.ml.lsh +package org.apache.spark.ml.feature.lsh import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Dataset diff --git a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala similarity index 99% rename from mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala rename to mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala index b92548c49bd2d..f31f4cbd9adc7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.ml.lsh +package org.apache.spark.ml.feature.lsh import breeze.numerics.{cos, sin} import breeze.numerics.constants.Pi From 9065f7d31e81045f96ec4502fc7078b3d89d9d72 Mon Sep 17 00:00:00 2001 From: Yunni Date: Tue, 20 Sep 2016 12:01:43 -0400 Subject: [PATCH 11/45] Add comment to clarify the implementation of RandomProjection --- .../apache/spark/ml/feature/lsh/RandomProjection.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala index 5a19a21ff913f..f58bef9f97702 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala @@ -52,6 +52,14 @@ class RandomProjectionModel( } } +/** + * This [[RandomProjection]] implements Locality Sensitive Hashing functions with 2-stable + * distributions. If you are looking for LSH for cos distance, please use [[SignRandomProjection]] + * + * References: + * Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint + * arXiv:1408.2927 (2014). + */ class RandomProjection(override val uid: String) extends LSH[Vector, RandomProjectionModel] with RandomProjectionParams { From d22dff4d88754680d23b11c406d9189a964d0ebd Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 26 Sep 2016 01:19:00 -0400 Subject: [PATCH 12/45] Implementation of MinHash with unit tests --- .../org/apache/spark/ml/feature/lsh/LSH.scala | 7 +- .../apache/spark/ml/feature/lsh/MinHash.scala | 94 +++++++++++++++++++ .../ml/feature/lsh/RandomProjection.scala | 6 +- .../spark/ml/feature/lsh/MinHashSuite.scala | 81 ++++++++++++++++ 4 files changed, 182 insertions(+), 6 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala index 41315b28d2731..51cbdfc61078f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala @@ -260,10 +260,10 @@ abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with * Validate and create a new instance of concrete LSHModel. Because different LSHModel may have * different initial setting, developer needs to define how their LSHModel is created instead of * using reflection in this abstract class. - * @param inputDim the input dimension of input dataset + * @param dataset The input dataset of LSH fit * @return A new LSHModel instance without any params */ - protected[this] def createRawLSHModel(inputDim: Int): T + protected[this] def createRawLSHModel(dataset: Dataset[_]): T override def copy(extra: ParamMap): Estimator[T] = defaultCopy(extra) @@ -271,8 +271,7 @@ abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with * Fits a model to the input data. */ override def fit(dataset: Dataset[_]): T = { - val inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size - val model = createRawLSHModel(inputDim).setParent(this) + val model = createRawLSHModel(dataset).setParent(this) copyValues(model) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala new file mode 100644 index 0000000000000..5bb85d06745db --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature.lsh + +import scala.util.Random + +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.{IntParam, Params, ParamValidators} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.Dataset + +/** + * Params for [[MinHash]]. + */ +private[ml] trait MinHashModelParams extends Params { + protected[this] val prime = 2038074743 + + val numIndex: IntParam = new IntParam(this, "numIndex", "the number of index", + ParamValidators.inRange(0, prime, lowerInclusive = false, upperInclusive = false)) +} + +class MinHashModel(override val uid: String, hashFunctions: Seq[Double => Double]) + extends LSHModel[Seq[Double], MinHashModel] with MinHashModelParams { + + override protected[this] val hashFunction: Seq[Double] => Vector = { + elems: Seq[Double] => + Vectors.dense(hashFunctions.map( + func => elems.map(func).min + ).toArray) + } + + /** + * :: DeveloperApi :: + * + * Calculate the distance between two different keys using the distance metric corresponding + * to the hashFunction + * + * @param x One of the point in the metric space + * @param y Another the point in the metric space + * @return The distance between x and y in double + */ + override protected[ml] def keyDistance(x: Seq[Double], y: Seq[Double]): Double = { + val xSet = x.toSet + val ySet = y.toSet + 1 - xSet.intersect(ySet).size.toDouble / xSet.union(ySet).size.toDouble + } +} + +/** + * LSH class for Jaccard distance + * @param uid + */ +class MinHash(override val uid: String) extends LSH[Seq[Double], MinHashModel] + with MinHashModelParams { + + private[this] lazy val randSeq: Seq[Int] = { + Seq.fill($(outputDim))(1 + Random.nextInt(prime - 1)).take($(outputDim)) + } + + private[this] lazy val hashFunctions: Seq[Double => Double] = { + (0 until $(outputDim)).map { + i: Int => { + // Perfect Hash function, use 2n buckets to reduce collision. + elem: Double => (1 + elem) * randSeq(i).toLong % prime % ($(numIndex) * 2) + } + } + } + + def this() = { + this(Identifiable.randomUID("min hash")) + } + + override protected[this] def createRawLSHModel(dataset: Dataset[_]): MinHashModel = { + new MinHashModel(uid, hashFunctions) + } + + /** @group setParam */ + def setNumIndex(value: Int): this.type = set(numIndex, value) +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala index f58bef9f97702..559335e9396df 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala @@ -24,6 +24,8 @@ import breeze.linalg.normalize import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.functions._ /** * Params for [[RandomProjection]]. @@ -79,8 +81,8 @@ class RandomProjection(override val uid: String) extends LSH[Vector, RandomProje /** @group setParam */ def setBucketLength(value: Double): this.type = set(bucketLength, value) - override protected[this] def createRawLSHModel(inputDim: Int): RandomProjectionModel = { - this.inputDim = inputDim + override protected[this] def createRawLSHModel(dataset: Dataset[_]): RandomProjectionModel = { + this.inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size new RandomProjectionModel(uid, randUnitVectors) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala new file mode 100644 index 0000000000000..7ff102981d0c6 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature.lsh + +import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { + test("MinHash") { + val data = { + for (i <- 0 to 95) yield (i until i + 5).map(_.toDouble).toArray + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + val mh = new MinHash() + .setOutputDim(1) + .setInputCol("keys") + .setOutputCol("values") + .setNumIndex(100) + + val (falsePositive, falseNegative) = LSHTest.checkLSHProperty(df, mh, 0.75, 0.5) + assert(falsePositive < 0.1) + assert(falseNegative < 0.1) + } + + test("approxNearestNeighbors for min hash") { + val data = { + for (i <- 0 to 95) yield (i until i + 5).map(_.toDouble).toArray + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + val mh = new MinHash() + .setOutputDim(20) + .setInputCol("keys") + .setOutputCol("values") + .setNumIndex(100) + + val key: Seq[Double] = (0 until 100).filter(_.toString.contains("1")).map(_.toDouble) + + val (precision, recall) = LSHTest.checkApproxNearestNeighbors(mh, df, key, 20) + assert(precision >= 0.7) + assert(recall >= 0.7) + } + + test("approxSimilarityJoin for minhash on different dataset") { + val dataA = { + for (i <- 0 to 20) yield (5 * i until 5 * i + 5).map(_.toDouble).toArray + } + val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") + + val dataB = { + for (i <- 0 to 30) yield (3 * i until 3 * i + 3).map(_.toDouble).toArray + } + val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") + + val mh = new MinHash() + .setOutputDim(20) + .setInputCol("keys") + .setOutputCol("values") + .setNumIndex(100) + + val (precision, recall) = LSHTest.checkApproxSimilarityJoin(mh, dfA, dfB, 0.5) + assert(precision == 1.0) + assert(recall >= 0.9) + } +} From 7e6d9383ceb353a726eef35af56dc915f67dbe77 Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 26 Sep 2016 01:55:33 -0400 Subject: [PATCH 13/45] Add options for Probing Single/Multiple bucket(s) in approxNearestNeighbors --- .../org/apache/spark/ml/feature/lsh/LSH.scala | 23 ++++++++++++++----- .../apache/spark/ml/feature/lsh/LSHTest.scala | 5 ++-- .../spark/ml/feature/lsh/MinHashSuite.scala | 2 +- .../feature/lsh/RandomProjectionSuite.scala | 8 ++++--- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala index 51cbdfc61078f..41caede87af7f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala @@ -123,14 +123,21 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] /** * Given a large dataset and an item, approximately find at most k items which have the closest * distance to the item. + * + * This method has implemented two way of fetching k nearest neighbors: + * Single Probing: Fast, return at most k elements (Probing only one buckets) + * Multiple Probing: Slow, return exact k elements (Probing multiple buckets close to the key) + * * @param dataset the dataset to look for the key * @param key The key to hash for the item * @param k The maximum number of items closest to the key + * @param singleProbing True for using Single Probing; false for multiple probing * @param distCol The column to store the distance between pairs * @return A dataset containing at most k items closest to the key. A distCol is added to show * the distance between each record and the key. */ def approxNearestNeighbors(dataset: Dataset[_], key: KeyType, k: Int = 1, + singleProbing: Boolean = true, distCol: String = "distance"): Dataset[_] = { assert(k > 0, "The number of nearest neighbors cannot be less than 1") // Get Hash Value of the key v @@ -141,13 +148,17 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType) val hashDistCol = hashDistUDF(col($(outputCol))) - // Compute threshold to get exact k elements. - val modelDatasetSortedByHash = modelDataset.sort(hashDistCol).limit(k) - val thresholdDataset = modelDatasetSortedByHash.select(max(hashDistCol)) - val hashThreshold = thresholdDataset.collect()(0)(0).asInstanceOf[Double] + val modelSubset = if (singleProbing) { + modelDataset.filter(hashDistCol === 0.0) + } else { + // Compute threshold to get exact k elements. + val modelDatasetSortedByHash = modelDataset.sort(hashDistCol).limit(k) + val thresholdDataset = modelDatasetSortedByHash.select(max(hashDistCol)) + val hashThreshold = thresholdDataset.collect()(0)(0).asInstanceOf[Double] - // Filter the dataset where the hash value is less than the threshold. - val modelSubset = modelDataset.filter(hashDistCol <= hashThreshold) + // Filter the dataset where the hash value is less than the threshold. + modelDataset.filter(hashDistCol <= hashThreshold) + } // Get the top k nearest neighbor by their distance to the key val keyDistUDF = udf((x: KeyType) => keyDistance(x, key), DataTypes.DoubleType) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala index 83ff49b19b61c..716b6dbe6cd80 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala @@ -78,7 +78,8 @@ private[ml] object LSHTest { * @return A tuple of two doubles, representing precision and recall rate */ def checkApproxNearestNeighbors[KeyType, T <: LSHModel[KeyType, T]] - (lsh: LSH[KeyType, T], dataset: Dataset[_], key: KeyType, k: Int): (Double, Double) = { + (lsh: LSH[KeyType, T], dataset: Dataset[_], key: KeyType, k: Int, + singleProbing: Boolean): (Double, Double) = { val model = lsh.fit(dataset) // Compute expected @@ -86,7 +87,7 @@ private[ml] object LSHTest { val expected = dataset.sort(distUDF(col(model.getInputCol))).limit(k) // Compute actual - val actual = model.approxNearestNeighbors(dataset, key, k) + val actual = model.approxNearestNeighbors(dataset, key, k, singleProbing) // Compute precision and recall val correctCount = expected.join(actual, model.getInputCol).count().toDouble diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala index 7ff102981d0c6..01a60ea0bb27a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala @@ -52,7 +52,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { val key: Seq[Double] = (0 until 100).filter(_.toString.contains("1")).map(_.toDouble) - val (precision, recall) = LSHTest.checkApproxNearestNeighbors(mh, df, key, 20) + val (precision, recall) = LSHTest.checkApproxNearestNeighbors(mh, df, key, 20, singleProbing = true) assert(precision >= 0.7) assert(recall >= 0.7) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala index f31f4cbd9adc7..4653ff98714f2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala @@ -76,12 +76,13 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setBucketLength(4.0) - val (precision, recall) = LSHTest.checkApproxNearestNeighbors(rp, df, key, 10) + val (precision, recall) = LSHTest.checkApproxNearestNeighbors(rp, df, key, 10, + singleProbing = true) assert(precision >= 0.7) assert(recall >= 0.7) } - test("approxNearestNeighbors for small bucket and large k") { + test("approxNearestNeighbors with multiple probing") { val data = { for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) } @@ -94,7 +95,8 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setBucketLength(1.0) - val (precision, recall) = LSHTest.checkApproxNearestNeighbors(rp, df, key, 100) + val (precision, recall) = LSHTest.checkApproxNearestNeighbors(rp, df, key, 100, + singleProbing = false) assert(precision >= 0.7) assert(recall >= 0.7) } From 0fad3efbb9da20f0f71ae7e88885fb31cda13d04 Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 26 Sep 2016 02:11:15 -0400 Subject: [PATCH 14/45] Allow users to transform datasets themselves before doing approxNearestNeighbors and approxSimilarityJoin. This improves the performance of multiple queries on the same dataset(s) --- .../scala/org/apache/spark/ml/feature/lsh/LSH.scala | 10 +++++++--- .../org/apache/spark/ml/feature/lsh/MinHashSuite.scala | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala index 41caede87af7f..7ded8ba16ae22 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala @@ -103,9 +103,13 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * Transforms the input dataset. */ override def transform(dataset: Dataset[_]): DataFrame = { - transformSchema(dataset.schema, logging = true) - val transformUDF = udf(hashFunction, new VectorUDT) - dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) + if (!dataset.columns.contains($(outputCol))) { + transformSchema(dataset.schema, logging = true) + val transformUDF = udf(hashFunction, new VectorUDT) + dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) + } else { + dataset.toDF() + } } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala index 01a60ea0bb27a..11f1b15af8928 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala @@ -52,7 +52,8 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { val key: Seq[Double] = (0 until 100).filter(_.toString.contains("1")).map(_.toDouble) - val (precision, recall) = LSHTest.checkApproxNearestNeighbors(mh, df, key, 20, singleProbing = true) + val (precision, recall) = LSHTest.checkApproxNearestNeighbors(mh, df, key, 20, + singleProbing = true) assert(precision >= 0.7) assert(recall >= 0.7) } From 0080b878553532956c6b319a3c5a3c614a6d1159 Mon Sep 17 00:00:00 2001 From: Yunni Date: Wed, 28 Sep 2016 15:00:53 -0400 Subject: [PATCH 15/45] Generalize Input types to Vector. For MinHash, use Sparse Vectors to represent sets --- .../org/apache/spark/ml/feature/lsh/LSH.scala | 22 +++---- .../apache/spark/ml/feature/lsh/MinHash.scala | 58 ++++++++----------- .../ml/feature/lsh/RandomProjection.scala | 24 +++----- .../apache/spark/ml/feature/lsh/LSHTest.scala | 21 +++---- .../spark/ml/feature/lsh/MinHashSuite.scala | 15 +++-- 5 files changed, 59 insertions(+), 81 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala index 7ded8ba16ae22..1e736f7df5b67 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala @@ -62,7 +62,7 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** * Model produced by [[LSH]]. */ -abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] +abstract class LSHModel[T <: LSHModel[T]] private[ml] extends Model[T] with LSHParams { override def copy(extra: ParamMap): T = defaultCopy(extra) /** @@ -71,7 +71,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * The hash function of LSH, mapping a predefined KeyType to a Vector * @return The mapping of LSH function. */ - protected[this] val hashFunction: KeyType => Vector + protected[this] val hashFunction: Vector => Vector /** * :: DeveloperApi :: @@ -82,7 +82,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * @param y Another the point in the metric space * @return The distance between x and y in double */ - protected[ml] def keyDistance(x: KeyType, y: KeyType): Double + protected[ml] def keyDistance(x: Vector, y: Vector): Double /** * :: DeveloperApi :: @@ -140,7 +140,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * @return A dataset containing at most k items closest to the key. A distCol is added to show * the distance between each record and the key. */ - def approxNearestNeighbors(dataset: Dataset[_], key: KeyType, k: Int = 1, + def approxNearestNeighbors(dataset: Dataset[_], key: Vector, k: Int = 1, singleProbing: Boolean = true, distCol: String = "distance"): Dataset[_] = { assert(k > 0, "The number of nearest neighbors cannot be less than 1") @@ -165,7 +165,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] } // Get the top k nearest neighbor by their distance to the key - val keyDistUDF = udf((x: KeyType) => keyDistance(x, key), DataTypes.DoubleType) + val keyDistUDF = udf((x: Vector) => keyDistance(x, key), DataTypes.DoubleType) val modelSubsetWithDistCol = modelSubset.withColumn(distCol, keyDistUDF(col($(inputCol)))) modelSubsetWithDistCol.sort(distCol).limit(k) } @@ -233,7 +233,7 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] .drop(explodeCols: _*) // Add a new column to store the distance of the two records. - val distUDF = udf((x: KeyType, y: KeyType) => keyDistance(x, y), DataTypes.DoubleType) + val distUDF = udf((x: Vector, y: Vector) => keyDistance(x, y), DataTypes.DoubleType) val joinedDatasetWithDist = joinedDataset.select(col("*"), distUDF(explodedA($(inputCol)), explodedB($(inputCol))).as(distCol) ) @@ -256,10 +256,9 @@ abstract class LSHModel[KeyType, T <: LSHModel[KeyType, T]] private[ml] * via hashing." VLDB 7 Sep. 1999: 518-529. * (2) Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). - * @tparam KeyType The input key type of LSH * @tparam T The class type of lsh */ -abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with LSHParams { +abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHParams { /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -275,10 +274,10 @@ abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with * Validate and create a new instance of concrete LSHModel. Because different LSHModel may have * different initial setting, developer needs to define how their LSHModel is created instead of * using reflection in this abstract class. - * @param dataset The input dataset of LSH fit + * @param inputDim The dimension of the input dataset * @return A new LSHModel instance without any params */ - protected[this] def createRawLSHModel(dataset: Dataset[_]): T + protected[this] def createRawLSHModel(inputDim: Int): T override def copy(extra: ParamMap): Estimator[T] = defaultCopy(extra) @@ -286,7 +285,8 @@ abstract class LSH[KeyType, T <: LSHModel[KeyType, T]] extends Estimator[T] with * Fits a model to the input data. */ override def fit(dataset: Dataset[_]): T = { - val model = createRawLSHModel(dataset).setParent(this) + val inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size + val model = createRawLSHModel(inputDim).setParent(this) copyValues(model) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala index 5bb85d06745db..518c2483e3c3f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala @@ -20,27 +20,15 @@ package org.apache.spark.ml.feature.lsh import scala.util.Random import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.ml.param.{IntParam, Params, ParamValidators} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.Dataset -/** - * Params for [[MinHash]]. - */ -private[ml] trait MinHashModelParams extends Params { - protected[this] val prime = 2038074743 - - val numIndex: IntParam = new IntParam(this, "numIndex", "the number of index", - ParamValidators.inRange(0, prime, lowerInclusive = false, upperInclusive = false)) -} - -class MinHashModel(override val uid: String, hashFunctions: Seq[Double => Double]) - extends LSHModel[Seq[Double], MinHashModel] with MinHashModelParams { +class MinHashModel(override val uid: String, hashFunctions: Seq[Int => Long]) + extends LSHModel[MinHashModel] { - override protected[this] val hashFunction: Seq[Double] => Vector = { - elems: Seq[Double] => + override protected[this] val hashFunction: Vector => Vector = { + elems: Vector => Vectors.dense(hashFunctions.map( - func => elems.map(func).min + func => elems.toSparse.indices.toList.map(func).min.toDouble ).toArray) } @@ -54,41 +42,41 @@ class MinHashModel(override val uid: String, hashFunctions: Seq[Double => Double * @param y Another the point in the metric space * @return The distance between x and y in double */ - override protected[ml] def keyDistance(x: Seq[Double], y: Seq[Double]): Double = { - val xSet = x.toSet - val ySet = y.toSet + override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { + val xSet = x.toSparse.indices.toSet + val ySet = y.toSparse.indices.toSet 1 - xSet.intersect(ySet).size.toDouble / xSet.union(ySet).size.toDouble } } /** * LSH class for Jaccard distance + * The input set should be represented in sparse vector form. For example, + * Vectors.sparse(10, Array[(2, 1.0), (3, 1.0), (5, 1.0)]) + * means there are 10 elements in the space. This set contains elem 2, elem 3 and elem 5 * @param uid */ -class MinHash(override val uid: String) extends LSH[Seq[Double], MinHashModel] - with MinHashModelParams { +class MinHash(override val uid: String) extends LSH[MinHashModel] { + + protected[this] val prime = 2038074743 private[this] lazy val randSeq: Seq[Int] = { Seq.fill($(outputDim))(1 + Random.nextInt(prime - 1)).take($(outputDim)) } - private[this] lazy val hashFunctions: Seq[Double => Double] = { - (0 until $(outputDim)).map { - i: Int => { - // Perfect Hash function, use 2n buckets to reduce collision. - elem: Double => (1 + elem) * randSeq(i).toLong % prime % ($(numIndex) * 2) - } - } - } - def this() = { this(Identifiable.randomUID("min hash")) } - override protected[this] def createRawLSHModel(dataset: Dataset[_]): MinHashModel = { + override protected[this] def createRawLSHModel(inputDim: Int): MinHashModel = { + val hashFunctions: Seq[Int => Long] = { + (0 until $(outputDim)).map { + i: Int => { + // Perfect Hash function, use 2n buckets to reduce collision. + elem: Int => (1 + elem) * randSeq(i).toLong % prime % (inputDim * 2) + } + } + } new MinHashModel(uid, hashFunctions) } - - /** @group setParam */ - def setNumIndex(value: Int): this.type = set(numIndex, value) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala index 559335e9396df..4ab571e784ef5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala @@ -24,8 +24,6 @@ import breeze.linalg.normalize import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.functions._ /** * Params for [[RandomProjection]]. @@ -38,7 +36,7 @@ private[ml] trait RandomProjectionParams extends Params { class RandomProjectionModel( override val uid: String, val randUnitVectors: Array[Vector]) - extends LSHModel[Vector, RandomProjectionModel] with RandomProjectionParams { + extends LSHModel[RandomProjectionModel] with RandomProjectionParams { override protected[this] val hashFunction: (Vector) => Vector = { key: Vector => { @@ -62,18 +60,9 @@ class RandomProjectionModel( * Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). */ -class RandomProjection(override val uid: String) extends LSH[Vector, RandomProjectionModel] +class RandomProjection(override val uid: String) extends LSH[RandomProjectionModel] with RandomProjectionParams { - private[this] var inputDim = -1 - - private[this] lazy val randUnitVectors: Array[Vector] = { - Array.fill($(outputDim)) { - val randArray = Array.fill(inputDim)(Random.nextGaussian()) - Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray))) - } - } - def this() = { this(Identifiable.randomUID("random projection")) } @@ -81,8 +70,13 @@ class RandomProjection(override val uid: String) extends LSH[Vector, RandomProje /** @group setParam */ def setBucketLength(value: Double): this.type = set(bucketLength, value) - override protected[this] def createRawLSHModel(dataset: Dataset[_]): RandomProjectionModel = { - this.inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size + override protected[this] def createRawLSHModel(inputDim: Int): RandomProjectionModel = { + val randUnitVectors: Array[Vector] = { + Array.fill($(outputDim)) { + val randArray = Array.fill(inputDim)(Random.nextGaussian()) + Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray))) + } + } new RandomProjectionModel(uid, randUnitVectors) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala index 716b6dbe6cd80..9ec91885c86c2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala @@ -39,12 +39,11 @@ private[ml] object LSHTest { * @param lsh The lsh instance to perform the hashing * @param dist1 Distance threshold for false positive * @param dist2 Distance threshold for false negative - * @tparam KeyType The input key type of LSH * @tparam T The class type of lsh * @return A tuple of two doubles, representing the false positive and false negative rate */ - def checkLSHProperty[KeyType, T <: LSHModel[KeyType, T]] - (dataset: Dataset[_], lsh: LSH[KeyType, T], dist1: Double, dist2: Double): (Double, Double) = { + def checkLSHProperty[T <: LSHModel[T]] + (dataset: Dataset[_], lsh: LSH[T], dist1: Double, dist2: Double): (Double, Double) = { val model = lsh.fit(dataset) val inputCol = model.getInputCol val outputCol = model.getOutputCol @@ -52,7 +51,7 @@ private[ml] object LSHTest { // Perform a cross join and label each pair of same_bucket and distance val pairs = transformedData.as("a").crossJoin(transformedData.as("b")) - val distUDF = udf((x: KeyType, y: KeyType) => model.keyDistance(x, y), DataTypes.DoubleType) + val distUDF = udf((x: Vector, y: Vector) => model.keyDistance(x, y), DataTypes.DoubleType) val sameBucket = udf((x: Vector, y: Vector) => model.hashDistance(x, y) == 0.0, DataTypes.BooleanType) val result = pairs @@ -73,17 +72,16 @@ private[ml] object LSHTest { * @param dataset the dataset to look for the key * @param key The key to hash for the item * @param k The maximum number of items closest to the key - * @tparam KeyType The input key type of LSH * @tparam T The class type of lsh * @return A tuple of two doubles, representing precision and recall rate */ - def checkApproxNearestNeighbors[KeyType, T <: LSHModel[KeyType, T]] - (lsh: LSH[KeyType, T], dataset: Dataset[_], key: KeyType, k: Int, + def checkApproxNearestNeighbors[T <: LSHModel[T]] + (lsh: LSH[T], dataset: Dataset[_], key: Vector, k: Int, singleProbing: Boolean): (Double, Double) = { val model = lsh.fit(dataset) // Compute expected - val distUDF = udf((x: KeyType) => model.keyDistance(x, key), DataTypes.DoubleType) + val distUDF = udf((x: Vector) => model.keyDistance(x, key), DataTypes.DoubleType) val expected = dataset.sort(distUDF(col(model.getInputCol))).limit(k) // Compute actual @@ -100,18 +98,17 @@ private[ml] object LSHTest { * @param datasetA One of the datasets to join * @param datasetB Another dataset to join * @param threshold The threshold for the distance of record pairs - * @tparam KeyType The input key type of LSH * @tparam T The class type of lsh * @return A tuple of two doubles, representing precision and recall rate */ - def checkApproxSimilarityJoin[KeyType, T <: LSHModel[KeyType, T]] - (lsh: LSH[KeyType, T], datasetA: Dataset[_], datasetB: Dataset[_], + def checkApproxSimilarityJoin[T <: LSHModel[T]] + (lsh: LSH[T], datasetA: Dataset[_], datasetB: Dataset[_], threshold: Double): (Double, Double) = { val model = lsh.fit(datasetA) val inputCol = model.getInputCol // Compute expected - val distUDF = udf((x: KeyType, y: KeyType) => model.keyDistance(x, y), DataTypes.DoubleType) + val distUDF = udf((x: Vector, y: Vector) => model.keyDistance(x, y), DataTypes.DoubleType) val expected = datasetA.as("a").crossJoin(datasetB.as("b")) .filter(distUDF(col(s"a.$inputCol"), col(s"b.$inputCol")) < threshold) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala index 11f1b15af8928..875c2827c9548 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala @@ -18,12 +18,13 @@ package org.apache.spark.ml.feature.lsh import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { test("MinHash") { val data = { - for (i <- 0 to 95) yield (i until i + 5).map(_.toDouble).toArray + for (i <- 0 to 95) yield Vectors.sparse(100, (i until i + 5).map((_, 1.0))) } val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") @@ -31,7 +32,6 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(1) .setInputCol("keys") .setOutputCol("values") - .setNumIndex(100) val (falsePositive, falseNegative) = LSHTest.checkLSHProperty(df, mh, 0.75, 0.5) assert(falsePositive < 0.1) @@ -40,7 +40,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { test("approxNearestNeighbors for min hash") { val data = { - for (i <- 0 to 95) yield (i until i + 5).map(_.toDouble).toArray + for (i <- 0 to 95) yield Vectors.sparse(100, (i until i + 5).map((_, 1.0))) } val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") @@ -48,9 +48,9 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(20) .setInputCol("keys") .setOutputCol("values") - .setNumIndex(100) - val key: Seq[Double] = (0 until 100).filter(_.toString.contains("1")).map(_.toDouble) + val key: Vector = Vectors.sparse(100, + (0 until 100).filter(_.toString.contains("1")).map((_, 1.0))) val (precision, recall) = LSHTest.checkApproxNearestNeighbors(mh, df, key, 20, singleProbing = true) @@ -60,12 +60,12 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { test("approxSimilarityJoin for minhash on different dataset") { val dataA = { - for (i <- 0 to 20) yield (5 * i until 5 * i + 5).map(_.toDouble).toArray + for (i <- 0 until 20) yield Vectors.sparse(100, (5 * i until 5 * i + 5).map((_, 1.0))) } val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") val dataB = { - for (i <- 0 to 30) yield (3 * i until 3 * i + 3).map(_.toDouble).toArray + for (i <- 0 until 30) yield Vectors.sparse(100, (3 * i until 3 * i + 3).map((_, 1.0))) } val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") @@ -73,7 +73,6 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(20) .setInputCol("keys") .setOutputCol("values") - .setNumIndex(100) val (precision, recall) = LSHTest.checkApproxSimilarityJoin(mh, dfA, dfB, 0.5) assert(precision == 1.0) From a1c344bb31904ea41c05f74516d1fcb1ad61a427 Mon Sep 17 00:00:00 2001 From: Yunni Date: Wed, 28 Sep 2016 15:40:09 -0400 Subject: [PATCH 16/45] Code Review Comments --- .../org/apache/spark/ml/feature/lsh/LSH.scala | 76 ++++++++++++------- .../apache/spark/ml/feature/lsh/MinHash.scala | 21 +++-- .../ml/feature/lsh/RandomProjection.scala | 14 ++++ .../apache/spark/ml/feature/lsh/LSHTest.scala | 28 ++++--- .../spark/ml/feature/lsh/MinHashSuite.scala | 6 +- .../feature/lsh/RandomProjectionSuite.scala | 12 +-- 6 files changed, 104 insertions(+), 53 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala index 1e736f7df5b67..7f35b0439a30b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala @@ -23,6 +23,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.sql._ import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions._ @@ -33,8 +34,11 @@ import org.apache.spark.sql.types._ */ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** - * Param for output dimension. + * Param for the dimension of LSH OR-amplification. * + * In this implementation, we use LSH OR-amplification to reduce the false negative rate. This + * param is the dimension of the amplification. The higher the dimension is, the lower the false + * negative rate. * @group param */ final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension", @@ -43,19 +47,15 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** @group getParam */ final def getOutputDim: Int = $(outputDim) - setDefault(outputDim -> 1) - - setDefault(outputCol -> "lsh_output") + setDefault(outputDim -> 1, outputCol -> "lsh_output") /** * Transform the Schema for LSH * @param schema The schema of the input dataset without outputCol * @return A derived schema with outputCol added */ - final def transformLSHSchema(schema: StructType): StructType = { - val outputFields = schema.fields :+ - StructField($(outputCol), new VectorUDT, nullable = false) - StructType(outputFields) + protected[this] final def validateAndTransformSchema(schema: StructType): StructType = { + SchemaUtils.appendColumn(schema, $(outputCol), new VectorUDT) } } @@ -94,22 +94,15 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] * @param y Another hash vector * @return The distance between hash vectors x and y in double */ - protected[ml] def hashDistance(x: Vector, y: Vector): Double = { - // Since it's generated by hashing, it will be a pair of dense vectors. - x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min - } + protected[ml] def hashDistance(x: Vector, y: Vector): Double /** * Transforms the input dataset. */ override def transform(dataset: Dataset[_]): DataFrame = { - if (!dataset.columns.contains($(outputCol))) { - transformSchema(dataset.schema, logging = true) - val transformUDF = udf(hashFunction, new VectorUDT) - dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) - } else { - dataset.toDF() - } + transformSchema(dataset.schema, logging = true) + val transformUDF = udf(hashFunction, new VectorUDT) + dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) } /** @@ -121,7 +114,7 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] * validity, including complex parameter interaction checks. */ override def transformSchema(schema: StructType): StructType = { - transformLSHSchema(schema) + validateAndTransformSchema(schema) } /** @@ -140,13 +133,20 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] * @return A dataset containing at most k items closest to the key. A distCol is added to show * the distance between each record and the key. */ - def approxNearestNeighbors(dataset: Dataset[_], key: Vector, k: Int = 1, - singleProbing: Boolean = true, - distCol: String = "distance"): Dataset[_] = { + def approxNearestNeighbors( + dataset: Dataset[_], + key: Vector, + k: Int, + singleProbing: Boolean, + distCol: String): Dataset[_] = { assert(k > 0, "The number of nearest neighbors cannot be less than 1") // Get Hash Value of the key v val keyHash = hashFunction(key) - val modelDataset = transform(dataset) + val modelDataset = if (!dataset.columns.contains($(outputCol))) { + transform(dataset) + } else { + dataset + } // In the origin dataset, find the hash value u that is closest to v val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType) @@ -170,6 +170,10 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] modelSubsetWithDistCol.sort(distCol).limit(k) } + def approxNearestNeighbors(dataset: Dataset[_], key: Vector, k: Int): Dataset[_] = { + approxNearestNeighbors(dataset, key, k, true, "distCol") + } + /** * Preprocess step for approximate similarity join. Transform and explode the outputCol to * explodeCols. @@ -183,8 +187,12 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] } val vectorToMap: UserDefinedFunction = udf((x: Vector) => x.asBreeze.iterator.toMap, MapType(DataTypes.IntegerType, DataTypes.DoubleType)) - transform(dataset) - .select(col("*"), explode(vectorToMap(col($(outputCol)))).as(explodeCols)) + val modelDataset = if (!dataset.columns.contains($(outputCol))) { + transform(dataset) + } else { + dataset + } + modelDataset.select(col("*"), explode(vectorToMap(col($(outputCol)))).as(explodeCols)) } /** @@ -213,8 +221,11 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] * @return A joined dataset containing pairs of records. A distCol is added to show the distance * between each pair of records. */ - def approxSimilarityJoin(datasetA: Dataset[_], datasetB: Dataset[_], threshold: Double, - distCol: String = "distance"): Dataset[_] = { + def approxSimilarityJoin( + datasetA: Dataset[_], + datasetB: Dataset[_], + threshold: Double, + distCol: String): Dataset[_] = { val explodeCols = Seq("lsh#entry", "lsh#hashValue") val explodedA = processDataset(datasetA, explodeCols) @@ -241,6 +252,13 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] // Filter the joined datasets where the distance are smaller than the threshold. joinedDatasetWithDist.filter(col(distCol) < threshold).distinct() } + + def approxSimilarityJoin( + datasetA: Dataset[_], + datasetB: Dataset[_], + threshold: Double): Dataset[_] = { + approxSimilarityJoin(datasetA, datasetB, threshold, "distCol") + } } /** @@ -299,6 +317,6 @@ abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHParams { * validity, including complex parameter interaction checks. */ override def transformSchema(schema: StructType): StructType = { - transformLSHSchema(schema) + validateAndTransformSchema(schema) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala index 518c2483e3c3f..594d7c87a5b64 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala @@ -47,6 +47,11 @@ class MinHashModel(override val uid: String, hashFunctions: Seq[Int => Long]) val ySet = y.toSparse.indices.toSet 1 - xSet.intersect(ySet).size.toDouble / xSet.union(ySet).size.toDouble } + + override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { + // Since it's generated by hashing, it will be a pair of dense vectors. + x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min + } } /** @@ -60,6 +65,12 @@ class MinHash(override val uid: String) extends LSH[MinHashModel] { protected[this] val prime = 2038074743 + override def setInputCol(value: String): this.type = super.setInputCol(value) + + override def setOutputCol(value: String): this.type = super.setOutputCol(value) + + override def setOutputDim(value: Int): this.type = super.setOutputDim(value) + private[this] lazy val randSeq: Seq[Int] = { Seq.fill($(outputDim))(1 + Random.nextInt(prime - 1)).take($(outputDim)) } @@ -69,12 +80,12 @@ class MinHash(override val uid: String) extends LSH[MinHashModel] { } override protected[this] def createRawLSHModel(inputDim: Int): MinHashModel = { + val numEntry = inputDim * 2 + assert(numEntry < prime, "The input vector dimension is too large for MinHash to handle.") val hashFunctions: Seq[Int => Long] = { - (0 until $(outputDim)).map { - i: Int => { - // Perfect Hash function, use 2n buckets to reduce collision. - elem: Int => (1 + elem) * randSeq(i).toLong % prime % (inputDim * 2) - } + (0 until $(outputDim)).map { i: Int => + // Perfect Hash function, use 2n buckets to reduce collision. + elem: Int => (1 + elem) * randSeq(i).toLong % prime % numEntry } } new MinHashModel(uid, hashFunctions) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala index 4ab571e784ef5..0cf1ec06c890e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala @@ -31,6 +31,9 @@ import org.apache.spark.ml.util.Identifiable private[ml] trait RandomProjectionParams extends Params { val bucketLength: DoubleParam = new DoubleParam(this, "bucketLength", "the length of each hash bucket", ParamValidators.gt(0)) + + /** @group getParam */ + final def getBucketLength: Double = $(bucketLength) } class RandomProjectionModel( @@ -50,6 +53,11 @@ class RandomProjectionModel( override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { Math.sqrt(Vectors.sqdist(x, y)) } + + override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { + // Since it's generated by hashing, it will be a pair of dense vectors. + x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min + } } /** @@ -63,6 +71,12 @@ class RandomProjectionModel( class RandomProjection(override val uid: String) extends LSH[RandomProjectionModel] with RandomProjectionParams { + override def setInputCol(value: String): this.type = super.setInputCol(value) + + override def setOutputCol(value: String): this.type = super.setOutputCol(value) + + override def setOutputDim(value: Int): this.type = super.setOutputDim(value) + def this() = { this(Identifiable.randomUID("random projection")) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala index 9ec91885c86c2..8fb09c7910561 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala @@ -42,8 +42,11 @@ private[ml] object LSHTest { * @tparam T The class type of lsh * @return A tuple of two doubles, representing the false positive and false negative rate */ - def checkLSHProperty[T <: LSHModel[T]] - (dataset: Dataset[_], lsh: LSH[T], dist1: Double, dist2: Double): (Double, Double) = { + def calculateLSHProperty[T <: LSHModel[T]]( + dataset: Dataset[_], + lsh: LSH[T], + dist1: Double, + dist2: Double): (Double, Double) = { val model = lsh.fit(dataset) val inputCol = model.getInputCol val outputCol = model.getOutputCol @@ -67,7 +70,7 @@ private[ml] object LSHTest { } /** - * Check and compute the precision and recall of approximate nearest neighbors + * Compute the precision and recall of approximate nearest neighbors * @param lsh The lsh instance * @param dataset the dataset to look for the key * @param key The key to hash for the item @@ -75,9 +78,12 @@ private[ml] object LSHTest { * @tparam T The class type of lsh * @return A tuple of two doubles, representing precision and recall rate */ - def checkApproxNearestNeighbors[T <: LSHModel[T]] - (lsh: LSH[T], dataset: Dataset[_], key: Vector, k: Int, - singleProbing: Boolean): (Double, Double) = { + def calculateApproxNearestNeighbors[T <: LSHModel[T]]( + lsh: LSH[T], + dataset: Dataset[_], + key: Vector, + k: Int, + singleProbing: Boolean): (Double, Double) = { val model = lsh.fit(dataset) // Compute expected @@ -93,7 +99,7 @@ private[ml] object LSHTest { } /** - * Check and compute the precision and recall of approximate similarity join + * Compute the precision and recall of approximate similarity join * @param lsh The lsh instance * @param datasetA One of the datasets to join * @param datasetB Another dataset to join @@ -101,9 +107,11 @@ private[ml] object LSHTest { * @tparam T The class type of lsh * @return A tuple of two doubles, representing precision and recall rate */ - def checkApproxSimilarityJoin[T <: LSHModel[T]] - (lsh: LSH[T], datasetA: Dataset[_], datasetB: Dataset[_], - threshold: Double): (Double, Double) = { + def calculateApproxSimilarityJoin[T <: LSHModel[T]]( + lsh: LSH[T], + datasetA: Dataset[_], + datasetB: Dataset[_], + threshold: Double): (Double, Double) = { val model = lsh.fit(datasetA) val inputCol = model.getInputCol diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala index 875c2827c9548..2d4b890a6e379 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala @@ -33,7 +33,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") - val (falsePositive, falseNegative) = LSHTest.checkLSHProperty(df, mh, 0.75, 0.5) + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, mh, 0.75, 0.5) assert(falsePositive < 0.1) assert(falseNegative < 0.1) } @@ -52,7 +52,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { val key: Vector = Vectors.sparse(100, (0 until 100).filter(_.toString.contains("1")).map((_, 1.0))) - val (precision, recall) = LSHTest.checkApproxNearestNeighbors(mh, df, key, 20, + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, df, key, 20, singleProbing = true) assert(precision >= 0.7) assert(recall >= 0.7) @@ -74,7 +74,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") - val (precision, recall) = LSHTest.checkApproxSimilarityJoin(mh, dfA, dfB, 0.5) + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, dfA, dfB, 0.5) assert(precision == 1.0) assert(recall >= 0.9) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala index 4653ff98714f2..97b9068a4863a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala @@ -38,7 +38,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setBucketLength(1.0) - val (falsePositive, falseNegative) = LSHTest.checkLSHProperty(df, rp, 8.0, 2.0) + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 8.0, 2.0) assert(falsePositive < 0.1) assert(falseNegative < 0.1) } @@ -58,7 +58,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setBucketLength(2.5) - val (falsePositive, falseNegative) = LSHTest.checkLSHProperty(df, rp, 3.0, 2.0) + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 3.0, 2.0) assert(falsePositive < 0.1) assert(falseNegative < 0.1) } @@ -76,7 +76,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setBucketLength(4.0) - val (precision, recall) = LSHTest.checkApproxNearestNeighbors(rp, df, key, 10, + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 10, singleProbing = true) assert(precision >= 0.7) assert(recall >= 0.7) @@ -95,7 +95,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setBucketLength(1.0) - val (precision, recall) = LSHTest.checkApproxNearestNeighbors(rp, df, key, 100, + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, singleProbing = false) assert(precision >= 0.7) assert(recall >= 0.7) @@ -118,7 +118,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setBucketLength(4.0) - val (precision, recall) = LSHTest.checkApproxSimilarityJoin(rp, dfA, dfB, 1.0) + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, dfA, dfB, 1.0) assert(precision == 1.0) assert(recall >= 0.9) } @@ -135,7 +135,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setBucketLength(4.0) - val (precision, recall) = LSHTest.checkApproxSimilarityJoin(rp, df, df, 3.0) + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, df, df, 3.0) assert(precision == 1.0) assert(recall >= 0.7) } From 396ad603082b2075f86ea38294749bcd3650ee7a Mon Sep 17 00:00:00 2001 From: Yunni Date: Wed, 28 Sep 2016 16:11:26 -0400 Subject: [PATCH 17/45] Bug fixed. Typo of distCol --- .../test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala | 4 ++-- .../scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala index 8fb09c7910561..318b9e15eadde 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala @@ -91,7 +91,7 @@ private[ml] object LSHTest { val expected = dataset.sort(distUDF(col(model.getInputCol))).limit(k) // Compute actual - val actual = model.approxNearestNeighbors(dataset, key, k, singleProbing) + val actual = model.approxNearestNeighbors(dataset, key, k, singleProbing, "distCol") // Compute precision and recall val correctCount = expected.join(actual, model.getInputCol).count().toDouble @@ -124,7 +124,7 @@ private[ml] object LSHTest { val actual = model.approxSimilarityJoin(datasetA, datasetB, threshold) // Compute precision and recall - val correctCount = actual.filter(col("distance") < threshold).count().toDouble + val correctCount = actual.filter(col("distCol") < threshold).count().toDouble (correctCount / actual.count(), correctCount / expected.count()) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala index 2d4b890a6e379..f507000982e73 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala @@ -34,7 +34,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, mh, 0.75, 0.5) - assert(falsePositive < 0.1) + assert(falsePositive < 0.3) assert(falseNegative < 0.1) } From b79ebbddede74cae0449f55b4aa69423d67ba07a Mon Sep 17 00:00:00 2001 From: Yunni Date: Wed, 28 Sep 2016 16:25:10 -0400 Subject: [PATCH 18/45] Fix Jenkins Build. Explicitly annotate type of modelDataset --- .../main/scala/org/apache/spark/ml/feature/lsh/LSH.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala index 7f35b0439a30b..3779fabbbc695 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala @@ -142,10 +142,10 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] assert(k > 0, "The number of nearest neighbors cannot be less than 1") // Get Hash Value of the key v val keyHash = hashFunction(key) - val modelDataset = if (!dataset.columns.contains($(outputCol))) { + val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) { transform(dataset) } else { - dataset + dataset.toDF() } // In the origin dataset, find the hash value u that is closest to v @@ -187,10 +187,10 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] } val vectorToMap: UserDefinedFunction = udf((x: Vector) => x.asBreeze.iterator.toMap, MapType(DataTypes.IntegerType, DataTypes.DoubleType)) - val modelDataset = if (!dataset.columns.contains($(outputCol))) { + val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) { transform(dataset) } else { - dataset + dataset.toDF() } modelDataset.select(col("*"), explode(vectorToMap(col($(outputCol)))).as(explodeCols)) } From 7936315e89c299e3e00e08e1c2338a555457753e Mon Sep 17 00:00:00 2001 From: Yunni Date: Wed, 28 Sep 2016 16:32:36 -0400 Subject: [PATCH 19/45] Move all code to org.apache.spark.ml.feature --- .../scala/org/apache/spark/ml/feature/{lsh => }/LSH.scala | 2 +- .../scala/org/apache/spark/ml/feature/{lsh => }/MinHash.scala | 4 ++-- .../apache/spark/ml/feature/{lsh => }/RandomProjection.scala | 2 +- .../scala/org/apache/spark/ml/feature/{lsh => }/LSHTest.scala | 2 +- .../org/apache/spark/ml/feature/{lsh => }/MinHashSuite.scala | 4 ++-- .../spark/ml/feature/{lsh => }/RandomProjectionSuite.scala | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) rename mllib/src/main/scala/org/apache/spark/ml/feature/{lsh => }/LSH.scala (99%) rename mllib/src/main/scala/org/apache/spark/ml/feature/{lsh => }/MinHash.scala (97%) rename mllib/src/main/scala/org/apache/spark/ml/feature/{lsh => }/RandomProjection.scala (98%) rename mllib/src/test/scala/org/apache/spark/ml/feature/{lsh => }/LSHTest.scala (99%) rename mllib/src/test/scala/org/apache/spark/ml/feature/{lsh => }/MinHashSuite.scala (96%) rename mllib/src/test/scala/org/apache/spark/ml/feature/{lsh => }/RandomProjectionSuite.scala (98%) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala similarity index 99% rename from mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala rename to mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 3779fabbbc695..53307e9324dd6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.ml.feature.lsh +package org.apache.spark.ml.feature import scala.util.Random diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala similarity index 97% rename from mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala rename to mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 594d7c87a5b64..c22f490f03bd2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -15,11 +15,11 @@ * limitations under the License. */ -package org.apache.spark.ml.feature.lsh +package org.apache.spark.ml.feature import scala.util.Random -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.{Vectors, Vector} import org.apache.spark.ml.util.Identifiable class MinHashModel(override val uid: String, hashFunctions: Seq[Int => Long]) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala similarity index 98% rename from mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala rename to mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index 0cf1ec06c890e..8de1349ecf777 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/lsh/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.ml.feature.lsh +package org.apache.spark.ml.feature import scala.util.Random diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala similarity index 99% rename from mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala rename to mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala index 318b9e15eadde..69a633d11f62c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.ml.feature.lsh +package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Dataset diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala similarity index 96% rename from mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala rename to mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index f507000982e73..861728cd2c884 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -15,10 +15,10 @@ * limitations under the License. */ -package org.apache.spark.ml.feature.lsh +package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.MLlibTestSparkContext class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala similarity index 98% rename from mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala rename to mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index 97b9068a4863a..c85ce16da325b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/lsh/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -15,9 +15,9 @@ * limitations under the License. */ -package org.apache.spark.ml.feature.lsh +package org.apache.spark.ml.feature -import breeze.numerics.{cos, sin} +import breeze.numerics.{sin, cos} import breeze.numerics.constants.Pi import org.apache.spark.SparkFunSuite From f80565806210d6ed6d895631dbd7b29d935d5485 Mon Sep 17 00:00:00 2001 From: Yunni Date: Wed, 28 Sep 2016 17:48:25 -0400 Subject: [PATCH 20/45] Tune threshold for approxNearestNeighbors unit tests --- .../spark/ml/feature/RandomProjectionSuite.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index c85ce16da325b..610c53e7904ee 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.feature -import breeze.numerics.{sin, cos} +import breeze.numerics.{cos, sin} import breeze.numerics.constants.Pi import org.apache.spark.SparkFunSuite @@ -78,8 +78,8 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 10, singleProbing = true) - assert(precision >= 0.7) - assert(recall >= 0.7) + assert(precision >= 0.6) + assert(recall >= 0.6) } test("approxNearestNeighbors with multiple probing") { @@ -97,8 +97,8 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, singleProbing = false) - assert(precision >= 0.7) - assert(recall >= 0.7) + assert(precision >= 0.6) + assert(recall >= 0.6) } test("approxSimilarityJoin for random projection on different dataset") { From 8f04ee834db3fb086588778c87a553ab733baaa4 Mon Sep 17 00:00:00 2001 From: Yunni Date: Wed, 28 Sep 2016 18:02:26 -0400 Subject: [PATCH 21/45] Fix import ordering --- mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index c22f490f03bd2..4e148bf0e96e7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import scala.util.Random -import org.apache.spark.ml.linalg.{Vectors, Vector} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.Identifiable class MinHashModel(override val uid: String, hashFunctions: Seq[Int => Long]) From f82f3fed266d47296f608820b42aa80e40ae2d5c Mon Sep 17 00:00:00 2001 From: Yunni Date: Wed, 28 Sep 2016 18:30:05 -0400 Subject: [PATCH 22/45] Add scaladoc for overloaded methods --- .../scala/org/apache/spark/ml/feature/LSH.scala | 16 ++++++++++++++++ .../apache/spark/ml/feature/MinHashSuite.scala | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 53307e9324dd6..c6a3f5164507b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -170,6 +170,15 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] modelSubsetWithDistCol.sort(distCol).limit(k) } + /** + * Overloaded method for approxNearestNeighbors. Use Single Probing as default way to search + * nearest neighbors and "distCol" as default distCol. + * @param dataset the dataset to look for the key + * @param key The key to hash for the item + * @param k The maximum number of items closest to the key + * @return A dataset containing at most k items closest to the key. A distCol is added to show + * the distance between each record and the key. + */ def approxNearestNeighbors(dataset: Dataset[_], key: Vector, k: Int): Dataset[_] = { approxNearestNeighbors(dataset, key, k, true, "distCol") } @@ -253,6 +262,13 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] joinedDatasetWithDist.filter(col(distCol) < threshold).distinct() } + /** + * Overloaded method for approxSimilarityJoin. Use "distCol" as default distCol. + * @param datasetA One of the datasets to join + * @param datasetB Another dataset to join + * @param threshold The threshold for the distance of record pairs + * @return + */ def approxSimilarityJoin( datasetA: Dataset[_], datasetB: Dataset[_], diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index 861728cd2c884..27f2ace36f2ff 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.{Vectors, Vector} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { From ccd98f7bf9f651267f8ea779133fa80f77887055 Mon Sep 17 00:00:00 2001 From: Yunni Date: Tue, 4 Oct 2016 02:53:49 -0400 Subject: [PATCH 23/45] Code review comments --- .../org/apache/spark/ml/feature/LSH.scala | 168 +++++++++--------- .../org/apache/spark/ml/feature/MinHash.scala | 35 ++-- .../spark/ml/feature/RandomProjection.scala | 34 +++- .../spark/ml/feature/MinHashSuite.scala | 2 +- .../ml/feature/RandomProjectionSuite.scala | 2 +- 5 files changed, 133 insertions(+), 108 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index c6a3f5164507b..f0e9309fb57d7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.feature import scala.util.Random +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} @@ -32,28 +33,33 @@ import org.apache.spark.sql.types._ /** * Params for [[LSH]]. */ +@Experimental +@Since("2.1.0") private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** * Param for the dimension of LSH OR-amplification. * - * In this implementation, we use LSH OR-amplification to reduce the false negative rate. This - * param is the dimension of the amplification. The higher the dimension is, the lower the false - * negative rate. + * In this implementation, we use LSH OR-amplification to reduce the false negative rate. The + * higher the dimension is, the lower the false negative rate. * @group param */ - final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension", - ParamValidators.gt(0)) + @Since("2.1.0") + final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension, where" + + "increasing dimensionality lowers the false negative rate", ParamValidators.gt(0)) /** @group getParam */ + @Since("2.1.0") final def getOutputDim: Int = $(outputDim) - setDefault(outputDim -> 1, outputCol -> "lsh_output") + // TODO: Decide about this default. It should probably depend on the particular LSH algorithm. + setDefault(outputDim -> 1, outputCol -> "lshFeatures") /** * Transform the Schema for LSH * @param schema The schema of the input dataset without outputCol * @return A derived schema with outputCol added */ + @Since("2.1.0") protected[this] final def validateAndTransformSchema(schema: StructType): StructType = { SchemaUtils.appendColumn(schema, $(outputCol), new VectorUDT) } @@ -62,85 +68,80 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** * Model produced by [[LSH]]. */ -abstract class LSHModel[T <: LSHModel[T]] private[ml] - extends Model[T] with LSHParams { +@Experimental +@Since("2.1.0") +private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHParams { + self: T => + + @Since("2.1.0") override def copy(extra: ParamMap): T = defaultCopy(extra) + /** - * :: DeveloperApi :: - * * The hash function of LSH, mapping a predefined KeyType to a Vector * @return The mapping of LSH function. */ + @Since("2.1.0") protected[this] val hashFunction: Vector => Vector /** - * :: DeveloperApi :: - * * Calculate the distance between two different keys using the distance metric corresponding * to the hashFunction * @param x One of the point in the metric space * @param y Another the point in the metric space - * @return The distance between x and y in double + * @return The distance between x and y */ + @Since("2.1.0") protected[ml] def keyDistance(x: Vector, y: Vector): Double /** - * :: DeveloperApi :: - * - * Calculate the distance between two different hash Vectors. By default, the distance is the - * minimum distance of two hash values in any dimension. + * Calculate the distance between two different hash Vectors. * * @param x One of the hash vector * @param y Another hash vector - * @return The distance between hash vectors x and y in double + * @return The distance between hash vectors x and y */ + @Since("2.1.0") protected[ml] def hashDistance(x: Vector, y: Vector): Double - /** - * Transforms the input dataset. - */ + @Since("2.1.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val transformUDF = udf(hashFunction, new VectorUDT) dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) } - /** - * :: DeveloperApi :: - * - * Check transform validity and derive the output schema from the input schema. - * - * Typical implementation should first conduct verification on schema change and parameter - * validity, including complex parameter interaction checks. - */ + @Since("2.1.0") override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } /** * Given a large dataset and an item, approximately find at most k items which have the closest - * distance to the item. + * distance to the item. If the outputCol is missing, the method will transform the data; if the + * the outputCol exists, it will use the outputCol. This allows caching of the transformed data + * when necessary. * - * This method has implemented two way of fetching k nearest neighbors: - * Single Probing: Fast, return at most k elements (Probing only one buckets) - * Multiple Probing: Slow, return exact k elements (Probing multiple buckets close to the key) + * This method implements two ways of fetching k nearest neighbors: + * - Single Probing: Fast, return at most k elements (Probing only one buckets) + * - Multiple Probing: Slow, return exact k elements (Probing multiple buckets close to the key) * - * @param dataset the dataset to look for the key - * @param key The key to hash for the item - * @param k The maximum number of items closest to the key + * @param dataset the dataset to search for nearest neighbors of the key + * @param key Feature vector representing the item to search for + * @param numNearestNeighbors The maximum number of nearest neighbors * @param singleProbing True for using Single Probing; false for multiple probing - * @param distCol The column to store the distance between pairs + * @param distCol Output column for storing the distance between each result record and the key * @return A dataset containing at most k items closest to the key. A distCol is added to show * the distance between each record and the key. */ + @Since("2.1.0") def approxNearestNeighbors( - dataset: Dataset[_], - key: Vector, - k: Int, - singleProbing: Boolean, - distCol: String): Dataset[_] = { - assert(k > 0, "The number of nearest neighbors cannot be less than 1") - // Get Hash Value of the key v + @Since("2.1.0") dataset: Dataset[_], + @Since("2.1.0") key: Vector, + @Since("2.1.0") numNearestNeighbors: Int, + @Since("2.1.0") singleProbing: Boolean, + @Since("2.1.0") distCol: String): Dataset[_] = { + require(numNearestNeighbors > 0, "The number of nearest neighbors cannot be less than 1") + // Get Hash Value of the key val keyHash = hashFunction(key) val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) { transform(dataset) @@ -148,7 +149,7 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] dataset.toDF() } - // In the origin dataset, find the hash value u that is closest to v + // In the origin dataset, find the hash value that is closest to the key val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType) val hashDistCol = hashDistUDF(col($(outputCol))) @@ -156,9 +157,9 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] modelDataset.filter(hashDistCol === 0.0) } else { // Compute threshold to get exact k elements. - val modelDatasetSortedByHash = modelDataset.sort(hashDistCol).limit(k) + val modelDatasetSortedByHash = modelDataset.sort(hashDistCol).limit(numNearestNeighbors) val thresholdDataset = modelDatasetSortedByHash.select(max(hashDistCol)) - val hashThreshold = thresholdDataset.collect()(0)(0).asInstanceOf[Double] + val hashThreshold = thresholdDataset.take(1).head.getDouble(0) // Filter the dataset where the hash value is less than the threshold. modelDataset.filter(hashDistCol <= hashThreshold) @@ -167,20 +168,19 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] // Get the top k nearest neighbor by their distance to the key val keyDistUDF = udf((x: Vector) => keyDistance(x, key), DataTypes.DoubleType) val modelSubsetWithDistCol = modelSubset.withColumn(distCol, keyDistUDF(col($(inputCol)))) - modelSubsetWithDistCol.sort(distCol).limit(k) + modelSubsetWithDistCol.sort(distCol).limit(numNearestNeighbors) } /** * Overloaded method for approxNearestNeighbors. Use Single Probing as default way to search * nearest neighbors and "distCol" as default distCol. - * @param dataset the dataset to look for the key - * @param key The key to hash for the item - * @param k The maximum number of items closest to the key - * @return A dataset containing at most k items closest to the key. A distCol is added to show - * the distance between each record and the key. */ - def approxNearestNeighbors(dataset: Dataset[_], key: Vector, k: Int): Dataset[_] = { - approxNearestNeighbors(dataset, key, k, true, "distCol") + @Since("2.1.0") + def approxNearestNeighbors( + @Since("2.1.0") dataset: Dataset[_], + @Since("2.1.0") key: Vector, + @Since("2.1.0") numNearestNeighbors: Int): Dataset[_] = { + approxNearestNeighbors(dataset, key, numNearestNeighbors, true, "distCol") } /** @@ -190,10 +190,9 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] * @param explodeCols The alias for the exploded columns, must be a seq of two strings. * @return A dataset containing idCol, inputCol and explodeCols */ + @Since("2.1.0") private[this] def processDataset(dataset: Dataset[_], explodeCols: Seq[String]): Dataset[_] = { - if (explodeCols.size != 2) { - throw new Exception("explodeCols must be two strings.") - } + require(explodeCols.size == 2, "explodeCols must be two strings.") val vectorToMap: UserDefinedFunction = udf((x: Vector) => x.asBreeze.iterator.toMap, MapType(DataTypes.IntegerType, DataTypes.DoubleType)) val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) { @@ -212,8 +211,11 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] * @param tmpColName A temporary column name which does not conflict with existing columns * @return */ - private[this] def recreateCol(dataset: Dataset[_], colName: String, - tmpColName: String): Dataset[_] = { + @Since("2.1.0") + private[this] def recreateCol( + @Since("2.1.0") dataset: Dataset[_], + @Since("2.1.0") colName: String, + @Since("2.1.0") tmpColName: String): Dataset[_] = { dataset .withColumnRenamed(colName, tmpColName) .withColumn(colName, col(tmpColName)) @@ -226,15 +228,16 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] * @param datasetA One of the datasets to join * @param datasetB Another dataset to join * @param threshold The threshold for the distance of record pairs - * @param distCol The column to store the distance between pairs + * @param distCol Output column for storing the distance between each result record and the key * @return A joined dataset containing pairs of records. A distCol is added to show the distance * between each pair of records. */ + @Since("2.1.0") def approxSimilarityJoin( - datasetA: Dataset[_], - datasetB: Dataset[_], - threshold: Double, - distCol: String): Dataset[_] = { + @Since("2.1.0") datasetA: Dataset[_], + @Since("2.1.0") datasetB: Dataset[_], + @Since("2.1.0") threshold: Double, + @Since("2.1.0") distCol: String): Dataset[_] = { val explodeCols = Seq("lsh#entry", "lsh#hashValue") val explodedA = processDataset(datasetA, explodeCols) @@ -264,15 +267,12 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] /** * Overloaded method for approxSimilarityJoin. Use "distCol" as default distCol. - * @param datasetA One of the datasets to join - * @param datasetB Another dataset to join - * @param threshold The threshold for the distance of record pairs - * @return */ + @Since("2.1.0") def approxSimilarityJoin( - datasetA: Dataset[_], - datasetB: Dataset[_], - threshold: Double): Dataset[_] = { + @Since("2.1.0") datasetA: Dataset[_], + @Since("2.1.0") datasetB: Dataset[_], + @Since("2.1.0") threshold: Double): Dataset[_] = { approxSimilarityJoin(datasetA, datasetB, threshold, "distCol") } } @@ -292,46 +292,42 @@ abstract class LSHModel[T <: LSHModel[T]] private[ml] * arXiv:1408.2927 (2014). * @tparam T The class type of lsh */ -abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHParams { +@Experimental +@Since("2.1.0") +private[ml] abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHParams { /** @group setParam */ + @Since("2.1.0") def setInputCol(value: String): this.type = set(inputCol, value) /** @group setParam */ + @Since("2.1.0") def setOutputCol(value: String): this.type = set(outputCol, value) /** @group setParam */ + @Since("2.1.0") def setOutputDim(value: Int): this.type = set(outputDim, value) /** - * :: DeveloperApi :: - * * Validate and create a new instance of concrete LSHModel. Because different LSHModel may have * different initial setting, developer needs to define how their LSHModel is created instead of * using reflection in this abstract class. * @param inputDim The dimension of the input dataset * @return A new LSHModel instance without any params */ + @Since("2.1.0") protected[this] def createRawLSHModel(inputDim: Int): T + @Since("2.1.0") override def copy(extra: ParamMap): Estimator[T] = defaultCopy(extra) - /** - * Fits a model to the input data. - */ + @Since("2.1.0") override def fit(dataset: Dataset[_]): T = { val inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size val model = createRawLSHModel(inputDim).setParent(this) copyValues(model) } - /** - * :: DeveloperApi :: - * - * Check transform validity and derive the output schema from the input schema. - * - * Typical implementation should first conduct verification on schema change and parameter - * validity, including complex parameter interaction checks. - */ + @Since("2.1.0") override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 4e148bf0e96e7..fe88971056b3c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -19,35 +19,35 @@ package org.apache.spark.ml.feature import scala.util.Random +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.Identifiable -class MinHashModel(override val uid: String, hashFunctions: Seq[Int => Long]) +/** + * Model produced by [[MinHash]] + */ +@Experimental +@Since("2.1.0") +private[ml] class MinHashModel(override val uid: String, hashFunctions: Seq[Int => Long]) extends LSHModel[MinHashModel] { + @Since("2.1.0") override protected[this] val hashFunction: Vector => Vector = { elems: Vector => + require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.") Vectors.dense(hashFunctions.map( func => elems.toSparse.indices.toList.map(func).min.toDouble ).toArray) } - /** - * :: DeveloperApi :: - * - * Calculate the distance between two different keys using the distance metric corresponding - * to the hashFunction - * - * @param x One of the point in the metric space - * @param y Another the point in the metric space - * @return The distance between x and y in double - */ + @Since("2.1.0") override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { val xSet = x.toSparse.indices.toSet val ySet = y.toSparse.indices.toSet 1 - xSet.intersect(ySet).size.toDouble / xSet.union(ySet).size.toDouble } + @Since("2.1.0") override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min @@ -61,24 +61,31 @@ class MinHashModel(override val uid: String, hashFunctions: Seq[Int => Long]) * means there are 10 elements in the space. This set contains elem 2, elem 3 and elem 5 * @param uid */ -class MinHash(override val uid: String) extends LSH[MinHashModel] { +@Experimental +@Since("2.1.0") +private[ml] class MinHash(override val uid: String) extends LSH[MinHashModel] { - protected[this] val prime = 2038074743 + private[this] val prime = 2038074743 + @Since("2.1.0") override def setInputCol(value: String): this.type = super.setInputCol(value) + @Since("2.1.0") override def setOutputCol(value: String): this.type = super.setOutputCol(value) + @Since("2.1.0") override def setOutputDim(value: Int): this.type = super.setOutputDim(value) private[this] lazy val randSeq: Seq[Int] = { Seq.fill($(outputDim))(1 + Random.nextInt(prime - 1)).take($(outputDim)) } - def this() = { + @Since("2.1.0") + private[ml] def this() = { this(Identifiable.randomUID("min hash")) } + @Since("2.1.0") override protected[this] def createRawLSHModel(inputDim: Int): MinHashModel = { val numEntry = inputDim * 2 assert(numEntry < prime, "The input vector dimension is too large for MinHash to handle.") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index 8de1349ecf777..df854000c0782 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -21,6 +21,7 @@ import scala.util.Random import breeze.linalg.normalize +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} import org.apache.spark.ml.util.Identifiable @@ -28,19 +29,29 @@ import org.apache.spark.ml.util.Identifiable /** * Params for [[RandomProjection]]. */ +@Experimental +@Since("2.1.0") private[ml] trait RandomProjectionParams extends Params { + @Since("2.1.0") val bucketLength: DoubleParam = new DoubleParam(this, "bucketLength", "the length of each hash bucket", ParamValidators.gt(0)) /** @group getParam */ + @Since("2.1.0") final def getBucketLength: Double = $(bucketLength) } -class RandomProjectionModel( - override val uid: String, - val randUnitVectors: Array[Vector]) +/** + * Model produced by [[LSH]] + */ +@Experimental +@Since("2.1.0") +private[ml] class RandomProjectionModel( + @Since("2.1.0") override val uid: String, + @Since("2.1.0") val randUnitVectors: Array[Vector]) extends LSHModel[RandomProjectionModel] with RandomProjectionParams { + @Since("2.1.0") override protected[this] val hashFunction: (Vector) => Vector = { key: Vector => { val hashValues: Array[Double] = randUnitVectors.map({ @@ -50,10 +61,12 @@ class RandomProjectionModel( } } + @Since("2.1.0") override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { Math.sqrt(Vectors.sqdist(x, y)) } + @Since("2.1.0") override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min @@ -62,28 +75,37 @@ class RandomProjectionModel( /** * This [[RandomProjection]] implements Locality Sensitive Hashing functions with 2-stable - * distributions. If you are looking for LSH for cos distance, please use [[SignRandomProjection]] + * distributions. * * References: * Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). */ -class RandomProjection(override val uid: String) extends LSH[RandomProjectionModel] +@Experimental +@Since("2.1.0") +class RandomProjection private[ml] ( + @Since("2.1.0") override val uid: String) extends LSH[RandomProjectionModel] with RandomProjectionParams { + @Since("2.1.0") override def setInputCol(value: String): this.type = super.setInputCol(value) + @Since("2.1.0") override def setOutputCol(value: String): this.type = super.setOutputCol(value) + @Since("2.1.0") override def setOutputDim(value: Int): this.type = super.setOutputDim(value) - def this() = { + @Since("2.1.0") + private[ml] def this() = { this(Identifiable.randomUID("random projection")) } /** @group setParam */ + @Since("2.1.0") def setBucketLength(value: Double): this.type = set(bucketLength, value) + @Since("2.1.0") override protected[this] def createRawLSHModel(inputDim: Int): RandomProjectionModel = { val randUnitVectors: Array[Vector] = { Array.fill($(outputDim)) { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index 27f2ace36f2ff..c8abf30cb031a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -34,7 +34,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, mh, 0.75, 0.5) - assert(falsePositive < 0.3) + assert(falsePositive < 0.5) assert(falseNegative < 0.1) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index 610c53e7904ee..387946419c306 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -120,7 +120,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, dfA, dfB, 1.0) assert(precision == 1.0) - assert(recall >= 0.9) + assert(recall >= 0.8) } test("approxSimilarityJoin for self join") { From 69efc84849894668f3ce8fe59379a92aa36e2cf2 Mon Sep 17 00:00:00 2001 From: Yunni Date: Tue, 4 Oct 2016 03:04:18 -0400 Subject: [PATCH 24/45] Move private[ml] to MinHash constructor --- mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index fe88971056b3c..28fe2bf0c6781 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -59,11 +59,10 @@ private[ml] class MinHashModel(override val uid: String, hashFunctions: Seq[Int * The input set should be represented in sparse vector form. For example, * Vectors.sparse(10, Array[(2, 1.0), (3, 1.0), (5, 1.0)]) * means there are 10 elements in the space. This set contains elem 2, elem 3 and elem 5 - * @param uid */ @Experimental @Since("2.1.0") -private[ml] class MinHash(override val uid: String) extends LSH[MinHashModel] { +class MinHash private[ml] (override val uid: String) extends LSH[MinHashModel] { private[this] val prime = 2038074743 From eced98d435b2a8bc29bd756decdcb81eca9c8bc8 Mon Sep 17 00:00:00 2001 From: Yunni Date: Tue, 4 Oct 2016 11:45:36 -0400 Subject: [PATCH 25/45] Detailed doc on bucketLength. Move private[ml] to Model constructor --- .../src/main/scala/org/apache/spark/ml/feature/MinHash.scala | 2 +- .../scala/org/apache/spark/ml/feature/RandomProjection.scala | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 28fe2bf0c6781..cb3074c38e989 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -28,7 +28,7 @@ import org.apache.spark.ml.util.Identifiable */ @Experimental @Since("2.1.0") -private[ml] class MinHashModel(override val uid: String, hashFunctions: Seq[Int => Long]) +class MinHashModel private[ml] (override val uid: String, hashFunctions: Seq[Int => Long]) extends LSHModel[MinHashModel] { @Since("2.1.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index df854000c0782..f59339bd5552d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -34,7 +34,8 @@ import org.apache.spark.ml.util.Identifiable private[ml] trait RandomProjectionParams extends Params { @Since("2.1.0") val bucketLength: DoubleParam = new DoubleParam(this, "bucketLength", - "the length of each hash bucket", ParamValidators.gt(0)) + "the length of each hash bucket, a larger bucket lowers the false negative rate.", + ParamValidators.gt(0)) /** @group getParam */ @Since("2.1.0") @@ -46,7 +47,7 @@ private[ml] trait RandomProjectionParams extends Params { */ @Experimental @Since("2.1.0") -private[ml] class RandomProjectionModel( +class RandomProjectionModel private[ml] ( @Since("2.1.0") override val uid: String, @Since("2.1.0") val randUnitVectors: Array[Vector]) extends LSHModel[RandomProjectionModel] with RandomProjectionParams { From 3487bcc32da26dbad2b3b3eaf294135a09cb47cc Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Tue, 4 Oct 2016 10:09:08 -0700 Subject: [PATCH 26/45] Tune threshold for MinHash --- .../test/scala/org/apache/spark/ml/feature/MinHashSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index c8abf30cb031a..30bee3428dfbe 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -54,8 +54,8 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, df, key, 20, singleProbing = true) - assert(precision >= 0.7) - assert(recall >= 0.7) + assert(precision >= 0.6) + assert(recall >= 0.6) } test("approxSimilarityJoin for minhash on different dataset") { From df198868f8505a307ac2bc1af33ff345f5207be6 Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Wed, 5 Oct 2016 10:30:17 -0700 Subject: [PATCH 27/45] Code review comments --- .../org/apache/spark/ml/feature/LSH.scala | 91 ++++++++++--------- .../org/apache/spark/ml/feature/MinHash.scala | 12 ++- .../spark/ml/feature/RandomProjection.scala | 12 ++- 3 files changed, 68 insertions(+), 47 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index f0e9309fb57d7..85f15f8f08856 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -45,7 +45,8 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { */ @Since("2.1.0") final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension, where" + - "increasing dimensionality lowers the false negative rate", ParamValidators.gt(0)) + "increasing dimensionality lowers the false negative rate, and decreasing dimensionality" + + " improves the running performance", ParamValidators.gt(0)) /** @group getParam */ @Since("2.1.0") @@ -56,8 +57,8 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** * Transform the Schema for LSH - * @param schema The schema of the input dataset without outputCol - * @return A derived schema with outputCol added + * @param schema The schema of the input dataset without [[outputCol]] + * @return A derived schema with [[outputCol]] added */ @Since("2.1.0") protected[this] final def validateAndTransformSchema(schema: StructType): StructType = { @@ -117,9 +118,9 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP /** * Given a large dataset and an item, approximately find at most k items which have the closest - * distance to the item. If the outputCol is missing, the method will transform the data; if the - * the outputCol exists, it will use the outputCol. This allows caching of the transformed data - * when necessary. + * distance to the item. If the [[outputCol]] is missing, the method will transform the data; if + * the [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the + * transformed data when necessary. * * This method implements two ways of fetching k nearest neighbors: * - Single Probing: Fast, return at most k elements (Probing only one buckets) @@ -135,11 +136,11 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP */ @Since("2.1.0") def approxNearestNeighbors( - @Since("2.1.0") dataset: Dataset[_], - @Since("2.1.0") key: Vector, - @Since("2.1.0") numNearestNeighbors: Int, - @Since("2.1.0") singleProbing: Boolean, - @Since("2.1.0") distCol: String): Dataset[_] = { + dataset: Dataset[_], + key: Vector, + numNearestNeighbors: Int, + singleProbing: Boolean, + distCol: String): Dataset[_] = { require(numNearestNeighbors > 0, "The number of nearest neighbors cannot be less than 1") // Get Hash Value of the key val keyHash = hashFunction(key) @@ -177,21 +178,24 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP */ @Since("2.1.0") def approxNearestNeighbors( - @Since("2.1.0") dataset: Dataset[_], - @Since("2.1.0") key: Vector, - @Since("2.1.0") numNearestNeighbors: Int): Dataset[_] = { + dataset: Dataset[_], + key: Vector, + numNearestNeighbors: Int): Dataset[_] = { approxNearestNeighbors(dataset, key, numNearestNeighbors, true, "distCol") } /** - * Preprocess step for approximate similarity join. Transform and explode the outputCol to + * Preprocess step for approximate similarity join. Transform and explode the [[outputCol]] to * explodeCols. * @param dataset The dataset to transform and explode. * @param explodeCols The alias for the exploded columns, must be a seq of two strings. * @return A dataset containing idCol, inputCol and explodeCols */ @Since("2.1.0") - private[this] def processDataset(dataset: Dataset[_], explodeCols: Seq[String]): Dataset[_] = { + private[this] def processDataset( + dataset: Dataset[_], + inputName: String, + explodeCols: Seq[String]): Dataset[_] = { require(explodeCols.size == 2, "explodeCols must be two strings.") val vectorToMap: UserDefinedFunction = udf((x: Vector) => x.asBreeze.iterator.toMap, MapType(DataTypes.IntegerType, DataTypes.DoubleType)) @@ -200,7 +204,9 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP } else { dataset.toDF() } - modelDataset.select(col("*"), explode(vectorToMap(col($(outputCol)))).as(explodeCols)) + modelDataset.select( + struct(col("*")).as(inputName), + explode(vectorToMap(col($(outputCol)))).as(explodeCols)) } /** @@ -213,9 +219,9 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP */ @Since("2.1.0") private[this] def recreateCol( - @Since("2.1.0") dataset: Dataset[_], - @Since("2.1.0") colName: String, - @Since("2.1.0") tmpColName: String): Dataset[_] = { + dataset: Dataset[_], + colName: String, + tmpColName: String): Dataset[_] = { dataset .withColumnRenamed(colName, tmpColName) .withColumn(colName, col(tmpColName)) @@ -223,8 +229,11 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP } /** - * Join two dataset to approximately find all pairs of records whose distance are smaller - * than the threshold. + * Join two dataset to approximately find all pairs of records whose distance are smaller than + * the threshold. If the [[outputCol]] is missing, the method will transform the data; if the + * [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the transformed + * data when necessary. + * * @param datasetA One of the datasets to join * @param datasetB Another dataset to join * @param threshold The threshold for the distance of record pairs @@ -234,21 +243,22 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP */ @Since("2.1.0") def approxSimilarityJoin( - @Since("2.1.0") datasetA: Dataset[_], - @Since("2.1.0") datasetB: Dataset[_], - @Since("2.1.0") threshold: Double, - @Since("2.1.0") distCol: String): Dataset[_] = { + datasetA: Dataset[_], + datasetB: Dataset[_], + threshold: Double, + distCol: String): Dataset[_] = { - val explodeCols = Seq("lsh#entry", "lsh#hashValue") - val explodedA = processDataset(datasetA, explodeCols) + val explodeCols = Seq("entry", "hashValue") + val inputName = "input" + val explodedA = processDataset(datasetA, inputName, explodeCols) // If this is a self join, we need to recreate the inputCol of datasetB to avoid ambiguity. // TODO: Remove recreateCol logic once SPARK-17154 is resolved. val explodedB = if (datasetA != datasetB) { - processDataset(datasetB, explodeCols) + processDataset(datasetB, inputName, explodeCols) } else { val recreatedB = recreateCol(datasetB, $(inputCol), s"${$(inputCol)}#${Random.nextString(5)}") - processDataset(recreatedB, explodeCols) + processDataset(recreatedB, inputName, explodeCols) } // Do a hash join on where the exploded hash values are equal. @@ -258,7 +268,8 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP // Add a new column to store the distance of the two records. val distUDF = udf((x: Vector, y: Vector) => keyDistance(x, y), DataTypes.DoubleType) val joinedDatasetWithDist = joinedDataset.select(col("*"), - distUDF(explodedA($(inputCol)), explodedB($(inputCol))).as(distCol) + distUDF(explodedA(s"$inputName.${$(inputCol)}"), + explodedB(s"$inputName.${$(inputCol)}")).as(distCol) ) // Filter the joined datasets where the distance are smaller than the threshold. @@ -270,9 +281,9 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP */ @Since("2.1.0") def approxSimilarityJoin( - @Since("2.1.0") datasetA: Dataset[_], - @Since("2.1.0") datasetB: Dataset[_], - @Since("2.1.0") threshold: Double): Dataset[_] = { + datasetA: Dataset[_], + datasetB: Dataset[_], + threshold: Double): Dataset[_] = { approxSimilarityJoin(datasetA, datasetB, threshold, "distCol") } } @@ -282,19 +293,17 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP * hash column, approximate nearest neighbor search with a dataset and a key, and approximate * similarity join of two datasets. * - * Currently the following LSH family is implemented: - * - Euclidean Distance: Random Projection - * * References: * (1) Gionis, Aristides, Piotr Indyk, and Rajeev Motwani. "Similarity search in high dimensions * via hashing." VLDB 7 Sep. 1999: 518-529. * (2) Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). - * @tparam T The class type of lsh */ @Experimental @Since("2.1.0") private[ml] abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHParams { + self: Estimator[T] => + /** @group setParam */ @Since("2.1.0") def setInputCol(value: String): this.type = set(inputCol, value) @@ -322,13 +331,9 @@ private[ml] abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHPa @Since("2.1.0") override def fit(dataset: Dataset[_]): T = { + transformSchema(dataset.schema, logging = true) val inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size val model = createRawLSHModel(inputDim).setParent(this) copyValues(model) } - - @Since("2.1.0") - override def transformSchema(schema: StructType): StructType = { - validateAndTransformSchema(schema) - } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index cb3074c38e989..8a23def578198 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -20,8 +20,9 @@ package org.apache.spark.ml.feature import scala.util.Random import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.types.StructType /** * Model produced by [[MinHash]] @@ -87,7 +88,7 @@ class MinHash private[ml] (override val uid: String) extends LSH[MinHashModel] { @Since("2.1.0") override protected[this] def createRawLSHModel(inputDim: Int): MinHashModel = { val numEntry = inputDim * 2 - assert(numEntry < prime, "The input vector dimension is too large for MinHash to handle.") + require(numEntry < prime, "The input vector dimension is too large for MinHash to handle.") val hashFunctions: Seq[Int => Long] = { (0 until $(outputDim)).map { i: Int => // Perfect Hash function, use 2n buckets to reduce collision. @@ -96,4 +97,11 @@ class MinHash private[ml] (override val uid: String) extends LSH[MinHashModel] { } new MinHashModel(uid, hashFunctions) } + + @Since("2.1.0") + override def transformSchema(schema: StructType): StructType = { + require(schema.apply($(inputCol)).dataType.sameType(new VectorUDT), + s"${$(inputCol)} must be vectors") + validateAndTransformSchema(schema) + } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index f59339bd5552d..835b328ad4abe 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -22,9 +22,10 @@ import scala.util.Random import breeze.linalg.normalize import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.types.StructType /** * Params for [[RandomProjection]]. @@ -43,7 +44,7 @@ private[ml] trait RandomProjectionParams extends Params { } /** - * Model produced by [[LSH]] + * Model produced by [[RandomProjection]] */ @Experimental @Since("2.1.0") @@ -116,4 +117,11 @@ class RandomProjection private[ml] ( } new RandomProjectionModel(uid, randUnitVectors) } + + @Since("2.1.0") + override def transformSchema(schema: StructType): StructType = { + require(schema.apply($(inputCol)).dataType.sameType(new VectorUDT), + s"${$(inputCol)} must be vectors") + validateAndTransformSchema(schema) + } } From efe323cd69b87cea6a19d39be0e480e9322b5fe5 Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 10 Oct 2016 11:49:57 -0400 Subject: [PATCH 28/45] Code Review Comments --- .../org/apache/spark/ml/feature/LSH.scala | 48 +++++++------ .../org/apache/spark/ml/feature/MinHash.scala | 68 ++++++++++++++----- .../spark/ml/feature/RandomProjection.scala | 56 +++++++++++---- .../org/apache/spark/ml/feature/LSHTest.scala | 23 ++++--- .../spark/ml/feature/MinHashSuite.scala | 13 ++-- .../ml/feature/RandomProjectionSuite.scala | 28 +++++--- 6 files changed, 158 insertions(+), 78 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 85f15f8f08856..f7ca0a913f870 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -26,14 +26,13 @@ import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.sql._ -import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** + * :: Experimental :: * Params for [[LSH]]. */ -@Experimental @Since("2.1.0") private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** @@ -52,9 +51,6 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { @Since("2.1.0") final def getOutputDim: Int = $(outputDim) - // TODO: Decide about this default. It should probably depend on the particular LSH algorithm. - setDefault(outputDim -> 1, outputCol -> "lshFeatures") - /** * Transform the Schema for LSH * @param schema The schema of the input dataset without [[outputCol]] @@ -67,6 +63,7 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { } /** + * :: Experimental :: * Model produced by [[LSH]]. */ @Experimental @@ -87,8 +84,8 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP /** * Calculate the distance between two different keys using the distance metric corresponding * to the hashFunction - * @param x One of the point in the metric space - * @param y Another the point in the metric space + * @param x One input vector in the metric space + * @param y One input vector in the metric space * @return The distance between x and y */ @Since("2.1.0") @@ -186,7 +183,9 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP /** * Preprocess step for approximate similarity join. Transform and explode the [[outputCol]] to - * explodeCols. + * two explodeCols: entry and value. "entry" is the index in hash vector, and "value" is the + * value of corresponding value of the index in the vector. + * * @param dataset The dataset to transform and explode. * @param explodeCols The alias for the exploded columns, must be a seq of two strings. * @return A dataset containing idCol, inputCol and explodeCols @@ -194,19 +193,12 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP @Since("2.1.0") private[this] def processDataset( dataset: Dataset[_], - inputName: String, explodeCols: Seq[String]): Dataset[_] = { - require(explodeCols.size == 2, "explodeCols must be two strings.") - val vectorToMap: UserDefinedFunction = udf((x: Vector) => x.asBreeze.iterator.toMap, - MapType(DataTypes.IntegerType, DataTypes.DoubleType)) - val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) { + if (!dataset.columns.contains($(outputCol))) { transform(dataset) } else { dataset.toDF() } - modelDataset.select( - struct(col("*")).as(inputName), - explode(vectorToMap(col($(outputCol)))).as(explodeCols)) } /** @@ -249,31 +241,32 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP distCol: String): Dataset[_] = { val explodeCols = Seq("entry", "hashValue") - val inputName = "input" - val explodedA = processDataset(datasetA, inputName, explodeCols) + val explodedA = processDataset(datasetA, explodeCols) // If this is a self join, we need to recreate the inputCol of datasetB to avoid ambiguity. // TODO: Remove recreateCol logic once SPARK-17154 is resolved. val explodedB = if (datasetA != datasetB) { - processDataset(datasetB, inputName, explodeCols) + processDataset(datasetB, explodeCols) } else { val recreatedB = recreateCol(datasetB, $(inputCol), s"${$(inputCol)}#${Random.nextString(5)}") - processDataset(recreatedB, inputName, explodeCols) + processDataset(recreatedB, explodeCols) } + val shareBucketUDF = udf((x: Vector, y: Vector) => hashDistance(x, y) == 0, + DataTypes.BooleanType) + // Do a hash join on where the exploded hash values are equal. - val joinedDataset = explodedA.join(explodedB, explodeCols) - .drop(explodeCols: _*) + val joinedDataset = explodedA.join(explodedB, shareBucketUDF(explodedA($(outputCol)), explodedB($(outputCol)))) // Add a new column to store the distance of the two records. val distUDF = udf((x: Vector, y: Vector) => keyDistance(x, y), DataTypes.DoubleType) val joinedDatasetWithDist = joinedDataset.select(col("*"), - distUDF(explodedA(s"$inputName.${$(inputCol)}"), - explodedB(s"$inputName.${$(inputCol)}")).as(distCol) + distUDF(explodedA(s"${$(inputCol)}"), + explodedB(s"${$(inputCol)}")).as(distCol) ) // Filter the joined datasets where the distance are smaller than the threshold. - joinedDatasetWithDist.filter(col(distCol) < threshold).distinct() + joinedDatasetWithDist.filter(col(distCol) < threshold) } /** @@ -289,10 +282,15 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP } /** + * :: Experimental :: * Locality Sensitive Hashing for different metrics space. Support basic transformation with a new * hash column, approximate nearest neighbor search with a dataset and a key, and approximate * similarity join of two datasets. * + * This LSH class implements OR-amplification: more than 1 hash functions can be chosen, and each + * input vector are hashed by all hash functions. Two input vectors are defined to be in the same + * bucket as long as ANY one of the hash value matches. + * * References: * (1) Gionis, Aristides, Piotr Indyk, and Rajeev Motwani. "Similarity search in high dimensions * via hashing." VLDB 7 Sep. 1999: 518-529. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 8a23def578198..a742eda3f3d70 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -21,11 +21,34 @@ import scala.util.Random import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.param.{BooleanParam, Params} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql.types.StructType /** + * :: Experimental :: + * Params for [[MinHash]]. + */ +@Since("2.1.0") +private[ml] trait MinHashParams extends Params { + + /** + * If true, set the random seed to 0. Otherwise, use default setting in scala.util.Random + * @group param + */ + @Since("2.1.0") + val hasSeed: BooleanParam = new BooleanParam(this, "hasSeed", + "If true, set the random seed to 0.") + + /** @group getParam */ + @Since("2.1.0") + final def getHasSeed: Boolean = $(hasSeed) +} + +/** + * :: Experimental :: * Model produced by [[MinHash]] + * @param hashFunctions A seq of hash functions, mapping elements to their hash values. */ @Experimental @Since("2.1.0") @@ -36,8 +59,9 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Seq[Int override protected[this] val hashFunction: Vector => Vector = { elems: Vector => require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.") + val elemsList = elems.toSparse.indices.toList Vectors.dense(hashFunctions.map( - func => elems.toSparse.indices.toList.map(func).min.toDouble + func => elemsList.map(func).min.toDouble ).toArray) } @@ -45,7 +69,10 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Seq[Int override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { val xSet = x.toSparse.indices.toSet val ySet = y.toSparse.indices.toSet - 1 - xSet.intersect(ySet).size.toDouble / xSet.union(ySet).size.toDouble + val intersectionSize = xSet.intersect(ySet).size.toDouble + val unionSize = xSet.union(ySet).size.toDouble + assert(unionSize > 0, "The union of two input sets must have at least 1 elements") + 1 - intersectionSize / unionSize } @Since("2.1.0") @@ -56,15 +83,20 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Seq[Int } /** - * LSH class for Jaccard distance - * The input set should be represented in sparse vector form. For example, - * Vectors.sparse(10, Array[(2, 1.0), (3, 1.0), (5, 1.0)]) - * means there are 10 elements in the space. This set contains elem 2, elem 3 and elem 5 + * :: Experimental :: + * LSH class for Jaccard distance. + * + * The input can be dense or sparse vectors, but it is more efficient if it is sparse. For example, + * `Vectors.sparse(10, Array[(2, 1.0), (3, 1.0), (5, 1.0)])` + * means there are 10 elements in the space. This set contains elem 2, elem 3 and elem 5. + * Also, any input vector must have at least 1 non-zero indices, and all non-zero values are treated + * as binary "1" values. */ @Experimental @Since("2.1.0") -class MinHash private[ml] (override val uid: String) extends LSH[MinHashModel] { +class MinHash(override val uid: String) extends LSH[MinHashModel] with MinHashParams { + // A large prime smaller than sqrt(2^63 − 1) private[this] val prime = 2038074743 @Since("2.1.0") @@ -76,19 +108,24 @@ class MinHash private[ml] (override val uid: String) extends LSH[MinHashModel] { @Since("2.1.0") override def setOutputDim(value: Int): this.type = super.setOutputDim(value) - private[this] lazy val randSeq: Seq[Int] = { - Seq.fill($(outputDim))(1 + Random.nextInt(prime - 1)).take($(outputDim)) - } - @Since("2.1.0") - private[ml] def this() = { + def this() = { this(Identifiable.randomUID("min hash")) } + setDefault(outputDim -> 1, outputCol -> "lshFeatures", hasSeed -> false) + + @Since("2.1.0") + def setHasSeed(value: Boolean): this.type = set(hasSeed, value) + @Since("2.1.0") override protected[this] def createRawLSHModel(inputDim: Int): MinHashModel = { + require(inputDim <= prime / 2, "The input vector dimension is too large for MinHash to handle.") + if ($(hasSeed)) Random.setSeed(0) val numEntry = inputDim * 2 - require(numEntry < prime, "The input vector dimension is too large for MinHash to handle.") + val randSeq: Seq[Int] = { + Seq.fill($(outputDim))(1 + Random.nextInt(prime - 1)) + } val hashFunctions: Seq[Int => Long] = { (0 until $(outputDim)).map { i: Int => // Perfect Hash function, use 2n buckets to reduce collision. @@ -100,8 +137,7 @@ class MinHash private[ml] (override val uid: String) extends LSH[MinHashModel] { @Since("2.1.0") override def transformSchema(schema: StructType): StructType = { - require(schema.apply($(inputCol)).dataType.sameType(new VectorUDT), - s"${$(inputCol)} must be vectors") + SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) validateAndTransformSchema(schema) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index 835b328ad4abe..7206d3f8fa510 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -23,34 +23,56 @@ import breeze.linalg.normalize import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT} -import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.param.{BooleanParam, DoubleParam, Params, ParamValidators} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql.types.StructType /** + * :: Experimental :: * Params for [[RandomProjection]]. */ -@Experimental @Since("2.1.0") private[ml] trait RandomProjectionParams extends Params { + + /** + * The length of each hash bucket, a larger bucket lowers the false negative rate. + * + * If input vectors are normalized, 1-10 times of pow(numRecords, -1/inputDim) would be a + * reasonable value + * @group param + */ @Since("2.1.0") val bucketLength: DoubleParam = new DoubleParam(this, "bucketLength", "the length of each hash bucket, a larger bucket lowers the false negative rate.", ParamValidators.gt(0)) + /** + * If true, set the random seed to 0. Otherwise, use default setting in scala.util.Random + * @group param + */ + @Since("2.1.0") + val hasSeed: BooleanParam = new BooleanParam(this, "hasSeed", + "If true, set the random seed to 0.") + + /** @group getParam */ + @Since("2.1.0") + final def getHasSeed: Boolean = $(hasSeed) + /** @group getParam */ @Since("2.1.0") final def getBucketLength: Double = $(bucketLength) } /** + * :: Experimental :: * Model produced by [[RandomProjection]] + * @param randUnitVectors An array of random unit vectors. Each vector represents a hash function. */ @Experimental @Since("2.1.0") class RandomProjectionModel private[ml] ( - @Since("2.1.0") override val uid: String, - @Since("2.1.0") val randUnitVectors: Array[Vector]) + override val uid: String, + val randUnitVectors: Array[Vector]) extends LSHModel[RandomProjectionModel] with RandomProjectionParams { @Since("2.1.0") @@ -76,8 +98,13 @@ class RandomProjectionModel private[ml] ( } /** - * This [[RandomProjection]] implements Locality Sensitive Hashing functions with 2-stable - * distributions. + * :: Experimental :: + * This [[RandomProjection]] implements Locality Sensitive Hashing functions for Euclidean + * distance metrics. + * + * The input is dense or sparse vectors, each of which represents a point in the Euclidean + * distance space. The output will be vectors of configurable dimension. Hash value in the same + * dimension is calculated by the same hash function. * * References: * Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint @@ -85,8 +112,7 @@ class RandomProjectionModel private[ml] ( */ @Experimental @Since("2.1.0") -class RandomProjection private[ml] ( - @Since("2.1.0") override val uid: String) extends LSH[RandomProjectionModel] +class RandomProjection(override val uid: String) extends LSH[RandomProjectionModel] with RandomProjectionParams { @Since("2.1.0") @@ -99,16 +125,23 @@ class RandomProjection private[ml] ( override def setOutputDim(value: Int): this.type = super.setOutputDim(value) @Since("2.1.0") - private[ml] def this() = { + def this() = { this(Identifiable.randomUID("random projection")) } + setDefault(outputDim -> 1, outputCol -> "lshFeatures", hasSeed -> false) + /** @group setParam */ @Since("2.1.0") def setBucketLength(value: Double): this.type = set(bucketLength, value) + /** @group setParam */ + @Since("2.1.0") + def setHasSeed(value: Boolean): this.type = set(hasSeed, value) + @Since("2.1.0") override protected[this] def createRawLSHModel(inputDim: Int): RandomProjectionModel = { + if ($(hasSeed)) Random.setSeed(0) val randUnitVectors: Array[Vector] = { Array.fill($(outputDim)) { val randArray = Array.fill(inputDim)(Random.nextGaussian()) @@ -120,8 +153,7 @@ class RandomProjection private[ml] ( @Since("2.1.0") override def transformSchema(schema: StructType): StructType = { - require(schema.apply($(inputCol)).dataType.sameType(new VectorUDT), - s"${$(inputCol)} must be vectors") + SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) validateAndTransformSchema(schema) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala index 69a633d11f62c..b3a32b4aaa76f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala @@ -27,26 +27,31 @@ private[ml] object LSHTest { * For any locality sensitive function h in a metric space, we meed to verify whether * the following property is satisfied. * - * There exist d1, d2, p1, p2, so that for any two elements e1 and e2, - * If dist(e1, e2) >= dist1, then Pr{h(x) == h(y)} >= p1 - * If dist(e1, e2) <= dist2, then Pr{h(x) != h(y)} <= p2 + * There exist dist1, dist2, p1, p2, so that for any two elements e1 and e2, + * If dist(e1, e2) <= dist1, then Pr{h(x) == h(y)} >= p1 + * If dist(e1, e2) >= dist2, then Pr{h(x) == h(y)} <= p2 * * This is called locality sensitive property. This method checks the property on an * existing dataset and calculate the probabilities. * (https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Definition) * + * This method hashes each elements to hash buckets using LSH, and calculate the false positive + * and false negative: + * False positive: Of all (e1, e2) sharing any bucket, the probability of dist(e1, e2) > distFP + * False positive: Of all (e1, e2) not sharing buckets, the probability of dist(e1, e2) < distFN + * * @param dataset The dataset to verify the locality sensitive hashing property. * @param lsh The lsh instance to perform the hashing - * @param dist1 Distance threshold for false positive - * @param dist2 Distance threshold for false negative + * @param distFP Distance threshold for false positive + * @param distFN Distance threshold for false negative * @tparam T The class type of lsh * @return A tuple of two doubles, representing the false positive and false negative rate */ def calculateLSHProperty[T <: LSHModel[T]]( dataset: Dataset[_], lsh: LSH[T], - dist1: Double, - dist2: Double): (Double, Double) = { + distFP: Double, + distFN: Double): (Double, Double) = { val model = lsh.fit(dataset) val inputCol = model.getInputCol val outputCol = model.getOutputCol @@ -64,8 +69,8 @@ private[ml] object LSHTest { // Compute the probabilities based on the join result val positive = result.filter(col("same_bucket")) val negative = result.filter(!col("same_bucket")) - val falsePositiveCount = positive.filter(col("distance") > dist1).count().toDouble - val falseNegativeCount = negative.filter(col("distance") < dist2).count().toDouble + val falsePositiveCount = positive.filter(col("distance") > distFP).count().toDouble + val falseNegativeCount = negative.filter(col("distance") < distFN).count().toDouble (falsePositiveCount / positive.count(), falseNegativeCount / negative.count()) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index 30bee3428dfbe..93a194f64cb4d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -32,10 +32,11 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(1) .setInputCol("keys") .setOutputCol("values") + .setHasSeed(true) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, mh, 0.75, 0.5) - assert(falsePositive < 0.5) - assert(falseNegative < 0.1) + assert(falsePositive < 0.03) + assert(falseNegative < 0.01) } test("approxNearestNeighbors for min hash") { @@ -48,14 +49,15 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(20) .setInputCol("keys") .setOutputCol("values") + .setHasSeed(true) val key: Vector = Vectors.sparse(100, (0 until 100).filter(_.toString.contains("1")).map((_, 1.0))) val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, df, key, 20, singleProbing = true) - assert(precision >= 0.6) - assert(recall >= 0.6) + assert(precision >= 0.9) + assert(recall >= 0.9) } test("approxSimilarityJoin for minhash on different dataset") { @@ -73,9 +75,10 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(20) .setInputCol("keys") .setOutputCol("values") + .setHasSeed(true) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, dfA, dfB, 0.5) assert(precision == 1.0) - assert(recall >= 0.9) + assert(recall == 1.0) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index 387946419c306..f7d838b08e9b5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -37,10 +37,11 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(1.0) + .setHasSeed(true) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 8.0, 2.0) - assert(falsePositive < 0.1) - assert(falseNegative < 0.1) + assert(falsePositive < 0.07) + assert(falseNegative < 0.05) } test("RandomProjection with high dimension data") { @@ -57,10 +58,11 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(2.5) + .setHasSeed(true) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 3.0, 2.0) - assert(falsePositive < 0.1) - assert(falseNegative < 0.1) + assert(falsePositive == 0.0) + assert(falseNegative < 0.03) } test("approxNearestNeighbors for random projection") { @@ -75,11 +77,12 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) + .setHasSeed(true) - val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 10, + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, singleProbing = true) - assert(precision >= 0.6) - assert(recall >= 0.6) + assert(precision >= 0.7) + assert(recall >= 0.7) } test("approxNearestNeighbors with multiple probing") { @@ -94,11 +97,12 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(1.0) + .setHasSeed(true) val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, singleProbing = false) - assert(precision >= 0.6) - assert(recall >= 0.6) + assert(precision >= 0.75) + assert(recall >= 0.75) } test("approxSimilarityJoin for random projection on different dataset") { @@ -117,10 +121,11 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) + .setHasSeed(true) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, dfA, dfB, 1.0) assert(precision == 1.0) - assert(recall >= 0.8) + assert(recall >= 0.95) } test("approxSimilarityJoin for self join") { @@ -134,9 +139,10 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) + .setHasSeed(true) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, df, df, 3.0) assert(precision == 1.0) - assert(recall >= 0.7) + assert(recall == 1.0) } } From 142d8e96f7c7e5ef80b3fe11ada1be9cd499bc8a Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 10 Oct 2016 12:17:22 -0400 Subject: [PATCH 29/45] Revert unrelated changes --- .../org/apache/spark/ml/feature/LSH.scala | 28 +++++++++++-------- .../ml/feature/RandomProjectionSuite.scala | 2 +- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index f7ca0a913f870..654254136443d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -193,12 +193,19 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP @Since("2.1.0") private[this] def processDataset( dataset: Dataset[_], + inputName: String, explodeCols: Seq[String]): Dataset[_] = { - if (!dataset.columns.contains($(outputCol))) { + require(explodeCols.size == 2, "explodeCols must be two strings.") + val vectorToMap = udf((x: Vector) => x.asBreeze.iterator.toMap, + MapType(DataTypes.IntegerType, DataTypes.DoubleType)) + val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) { transform(dataset) } else { dataset.toDF() } + modelDataset.select( + struct(col("*")).as(inputName), + explode(vectorToMap(col($(outputCol)))).as(explodeCols)) } /** @@ -241,32 +248,31 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP distCol: String): Dataset[_] = { val explodeCols = Seq("entry", "hashValue") - val explodedA = processDataset(datasetA, explodeCols) + val inputName = "input" + val explodedA = processDataset(datasetA, inputName, explodeCols) // If this is a self join, we need to recreate the inputCol of datasetB to avoid ambiguity. // TODO: Remove recreateCol logic once SPARK-17154 is resolved. val explodedB = if (datasetA != datasetB) { - processDataset(datasetB, explodeCols) + processDataset(datasetB, inputName, explodeCols) } else { val recreatedB = recreateCol(datasetB, $(inputCol), s"${$(inputCol)}#${Random.nextString(5)}") - processDataset(recreatedB, explodeCols) + processDataset(recreatedB, inputName, explodeCols) } - val shareBucketUDF = udf((x: Vector, y: Vector) => hashDistance(x, y) == 0, - DataTypes.BooleanType) - // Do a hash join on where the exploded hash values are equal. - val joinedDataset = explodedA.join(explodedB, shareBucketUDF(explodedA($(outputCol)), explodedB($(outputCol)))) + val joinedDataset = explodedA.join(explodedB, explodeCols) + .drop(explodeCols: _*) // Add a new column to store the distance of the two records. val distUDF = udf((x: Vector, y: Vector) => keyDistance(x, y), DataTypes.DoubleType) val joinedDatasetWithDist = joinedDataset.select(col("*"), - distUDF(explodedA(s"${$(inputCol)}"), - explodedB(s"${$(inputCol)}")).as(distCol) + distUDF(explodedA(s"$inputName.${$(inputCol)}"), + explodedB(s"$inputName.${$(inputCol)}")).as(distCol) ) // Filter the joined datasets where the distance are smaller than the threshold. - joinedDatasetWithDist.filter(col(distCol) < threshold) + joinedDatasetWithDist.filter(col(distCol) < threshold).distinct() } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index f7d838b08e9b5..63c26fad62d1e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -143,6 +143,6 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, df, df, 3.0) assert(precision == 1.0) - assert(recall == 1.0) + assert(recall >= 0.9) } } From 40d1f1b077232a8feeb2dd66d9b846ded1839e63 Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Mon, 10 Oct 2016 13:23:12 -0700 Subject: [PATCH 30/45] Code review comments for MinHash: (1) Compute unionSize based on setSizes and intersectionSize (2) hash functions generated from randSeq --- .../main/scala/org/apache/spark/ml/feature/MinHash.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index a742eda3f3d70..98b1df4cf655b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -70,7 +70,7 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Seq[Int val xSet = x.toSparse.indices.toSet val ySet = y.toSparse.indices.toSet val intersectionSize = xSet.intersect(ySet).size.toDouble - val unionSize = xSet.union(ySet).size.toDouble + val unionSize = xSet.size + ySet.size - intersectionSize assert(unionSize > 0, "The union of two input sets must have at least 1 elements") 1 - intersectionSize / unionSize } @@ -127,9 +127,9 @@ class MinHash(override val uid: String) extends LSH[MinHashModel] with MinHashPa Seq.fill($(outputDim))(1 + Random.nextInt(prime - 1)) } val hashFunctions: Seq[Int => Long] = { - (0 until $(outputDim)).map { i: Int => + randSeq.map { randCoefficient: Int => // Perfect Hash function, use 2n buckets to reduce collision. - elem: Int => (1 + elem) * randSeq(i).toLong % prime % numEntry + elem: Int => (1 + elem) * randCoefficient.toLong % prime % numEntry } } new MinHashModel(uid, hashFunctions) From 2c95e5c1d89e2db0350b5d8667e2ae8d293df7a9 Mon Sep 17 00:00:00 2001 From: Yunni Date: Tue, 11 Oct 2016 00:11:15 -0400 Subject: [PATCH 31/45] Code review comments --- .../org/apache/spark/ml/feature/LSH.scala | 9 ++--- .../org/apache/spark/ml/feature/MinHash.scala | 33 ++++--------------- .../spark/ml/feature/RandomProjection.scala | 25 ++++---------- .../org/apache/spark/ml/feature/LSHTest.scala | 2 +- .../spark/ml/feature/MinHashSuite.scala | 6 ++-- .../ml/feature/RandomProjectionSuite.scala | 12 +++---- 6 files changed, 25 insertions(+), 62 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 654254136443d..d99d2908c7c5f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import scala.util.Random -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.Since import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} @@ -30,7 +30,6 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** - * :: Experimental :: * Params for [[LSH]]. */ @Since("2.1.0") @@ -51,6 +50,8 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { @Since("2.1.0") final def getOutputDim: Int = $(outputDim) + setDefault(outputDim -> 1, outputCol -> "lshFeatures") + /** * Transform the Schema for LSH * @param schema The schema of the input dataset without [[outputCol]] @@ -63,10 +64,8 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { } /** - * :: Experimental :: * Model produced by [[LSH]]. */ -@Experimental @Since("2.1.0") private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHParams { self: T => @@ -288,7 +287,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP } /** - * :: Experimental :: * Locality Sensitive Hashing for different metrics space. Support basic transformation with a new * hash column, approximate nearest neighbor search with a dataset and a key, and approximate * similarity join of two datasets. @@ -303,7 +301,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP * (2) Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). */ -@Experimental @Since("2.1.0") private[ml] abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHParams { self: Estimator[T] => diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 98b1df4cf655b..080dcde5649a1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -21,30 +21,10 @@ import scala.util.Random import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} -import org.apache.spark.ml.param.{BooleanParam, Params} +import org.apache.spark.ml.param.shared.HasSeed import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql.types.StructType -/** - * :: Experimental :: - * Params for [[MinHash]]. - */ -@Since("2.1.0") -private[ml] trait MinHashParams extends Params { - - /** - * If true, set the random seed to 0. Otherwise, use default setting in scala.util.Random - * @group param - */ - @Since("2.1.0") - val hasSeed: BooleanParam = new BooleanParam(this, "hasSeed", - "If true, set the random seed to 0.") - - /** @group getParam */ - @Since("2.1.0") - final def getHasSeed: Boolean = $(hasSeed) -} - /** * :: Experimental :: * Model produced by [[MinHash]] @@ -94,7 +74,7 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Seq[Int */ @Experimental @Since("2.1.0") -class MinHash(override val uid: String) extends LSH[MinHashModel] with MinHashParams { +class MinHash(override val uid: String) extends LSH[MinHashModel] with HasSeed { // A large prime smaller than sqrt(2^63 − 1) private[this] val prime = 2038074743 @@ -113,18 +93,17 @@ class MinHash(override val uid: String) extends LSH[MinHashModel] with MinHashPa this(Identifiable.randomUID("min hash")) } - setDefault(outputDim -> 1, outputCol -> "lshFeatures", hasSeed -> false) - + /** @group setParam */ @Since("2.1.0") - def setHasSeed(value: Boolean): this.type = set(hasSeed, value) + def setSeed(value: Long): this.type = set(seed, value) @Since("2.1.0") override protected[this] def createRawLSHModel(inputDim: Int): MinHashModel = { require(inputDim <= prime / 2, "The input vector dimension is too large for MinHash to handle.") - if ($(hasSeed)) Random.setSeed(0) + val rand = new Random($(seed)) val numEntry = inputDim * 2 val randSeq: Seq[Int] = { - Seq.fill($(outputDim))(1 + Random.nextInt(prime - 1)) + Seq.fill($(outputDim))(1 + rand.nextInt(prime - 1)) } val hashFunctions: Seq[Int => Long] = { randSeq.map { randCoefficient: Int => diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index 7206d3f8fa510..be5d3c40746d8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -23,7 +23,8 @@ import breeze.linalg.normalize import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT} -import org.apache.spark.ml.param.{BooleanParam, DoubleParam, Params, ParamValidators} +import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} +import org.apache.spark.ml.param.shared.HasSeed import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql.types.StructType @@ -46,18 +47,6 @@ private[ml] trait RandomProjectionParams extends Params { "the length of each hash bucket, a larger bucket lowers the false negative rate.", ParamValidators.gt(0)) - /** - * If true, set the random seed to 0. Otherwise, use default setting in scala.util.Random - * @group param - */ - @Since("2.1.0") - val hasSeed: BooleanParam = new BooleanParam(this, "hasSeed", - "If true, set the random seed to 0.") - - /** @group getParam */ - @Since("2.1.0") - final def getHasSeed: Boolean = $(hasSeed) - /** @group getParam */ @Since("2.1.0") final def getBucketLength: Double = $(bucketLength) @@ -113,7 +102,7 @@ class RandomProjectionModel private[ml] ( @Experimental @Since("2.1.0") class RandomProjection(override val uid: String) extends LSH[RandomProjectionModel] - with RandomProjectionParams { + with RandomProjectionParams with HasSeed { @Since("2.1.0") override def setInputCol(value: String): this.type = super.setInputCol(value) @@ -129,22 +118,20 @@ class RandomProjection(override val uid: String) extends LSH[RandomProjectionMod this(Identifiable.randomUID("random projection")) } - setDefault(outputDim -> 1, outputCol -> "lshFeatures", hasSeed -> false) - /** @group setParam */ @Since("2.1.0") def setBucketLength(value: Double): this.type = set(bucketLength, value) /** @group setParam */ @Since("2.1.0") - def setHasSeed(value: Boolean): this.type = set(hasSeed, value) + def setSeed(value: Long): this.type = set(seed, value) @Since("2.1.0") override protected[this] def createRawLSHModel(inputDim: Int): RandomProjectionModel = { - if ($(hasSeed)) Random.setSeed(0) + val rand = new Random($(seed)) val randUnitVectors: Array[Vector] = { Array.fill($(outputDim)) { - val randArray = Array.fill(inputDim)(Random.nextGaussian()) + val randArray = Array.fill(inputDim)(rand.nextGaussian()) Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray))) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala index b3a32b4aaa76f..bc1ea0a16de40 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala @@ -38,7 +38,7 @@ private[ml] object LSHTest { * This method hashes each elements to hash buckets using LSH, and calculate the false positive * and false negative: * False positive: Of all (e1, e2) sharing any bucket, the probability of dist(e1, e2) > distFP - * False positive: Of all (e1, e2) not sharing buckets, the probability of dist(e1, e2) < distFN + * False negative: Of all (e1, e2) not sharing buckets, the probability of dist(e1, e2) < distFN * * @param dataset The dataset to verify the locality sensitive hashing property. * @param lsh The lsh instance to perform the hashing diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index 93a194f64cb4d..c706ff78c9456 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -32,7 +32,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(1) .setInputCol("keys") .setOutputCol("values") - .setHasSeed(true) + .setSeed(0) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, mh, 0.75, 0.5) assert(falsePositive < 0.03) @@ -49,7 +49,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(20) .setInputCol("keys") .setOutputCol("values") - .setHasSeed(true) + .setSeed(0) val key: Vector = Vectors.sparse(100, (0 until 100).filter(_.toString.contains("1")).map((_, 1.0))) @@ -75,7 +75,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(20) .setInputCol("keys") .setOutputCol("values") - .setHasSeed(true) + .setSeed(0) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, dfA, dfB, 0.5) assert(precision == 1.0) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index 63c26fad62d1e..0ff255623b216 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -37,7 +37,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(1.0) - .setHasSeed(true) + .setSeed(0) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 8.0, 2.0) assert(falsePositive < 0.07) @@ -58,7 +58,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(2.5) - .setHasSeed(true) + .setSeed(0) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 3.0, 2.0) assert(falsePositive == 0.0) @@ -77,7 +77,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) - .setHasSeed(true) + .setSeed(0) val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, singleProbing = true) @@ -97,7 +97,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(1.0) - .setHasSeed(true) + .setSeed(0) val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, singleProbing = false) @@ -121,7 +121,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) - .setHasSeed(true) + .setSeed(0) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, dfA, dfB, 1.0) assert(precision == 1.0) @@ -139,7 +139,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) - .setHasSeed(true) + .setSeed(0) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, df, df, 3.0) assert(precision == 1.0) From fb120afc65fee1badc23d3e502f7196dc1d3c4fe Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Tue, 11 Oct 2016 14:31:23 -0700 Subject: [PATCH 32/45] SignRandomProjection: LSH Classes for cosine distance metrics --- .../ml/feature/SignRandomProjection.scala | 118 ++++++++++++++++++ .../feature/SignRandomProjectionSuite.scala | 82 ++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala new file mode 100644 index 0000000000000..11817b27b883b --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import scala.util.Random + +import breeze.linalg.normalize + +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.shared.HasSeed +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.sql.types.StructType + +/** + * :: Experimental :: + * Model produced by [[SignRandomProjection]] + * @param randUnitVectors An array of random unit vectors. Each vector represents a hash function. + */ +@Experimental +@Since("2.1.0") +class SignRandomProjectionModel private[ml] ( + override val uid: String, + val randUnitVectors: Array[Vector]) + extends LSHModel[SignRandomProjectionModel] { + + @Since("2.1.0") + override protected[this] val hashFunction: (Vector) => Vector = { + key: Vector => { + val hashValues: Array[Double] = randUnitVectors.map({ + randUnitVector => Math.signum(BLAS.dot(key, randUnitVector)) + }) + Vectors.dense(hashValues) + } + } + + @Since("2.1.0") + override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { + // 1 - cosine similarity + 1 - BLAS.dot(x, y) / (Vectors.norm(x, 2) * Vectors.norm(y, 2)) + } + + @Since("2.1.0") + override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { + // Since it's generated by hashing, it will be a pair of dense vectors. + x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min + } +} + +/** + * :: Experimental :: + * This [[SignRandomProjectionModel]] implements Locality Sensitive Hashing functions for cosine + * distance metrics. + * + * The input is dense or sparse vectors, each of which represents a point in the space. The output + * will be vectors of configurable dimension, taking values from {-1, 1, 0}. Hash value in the same + * dimension is calculated by the same hash function. + * + * References: + * Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint + * arXiv:1408.2927 (2014). + */ +@Experimental +@Since("2.1.0") +class SignRandomProjection(override val uid: String) extends LSH[SignRandomProjectionModel] + with HasSeed { + + @Since("2.1.0") + override def setInputCol(value: String): this.type = super.setInputCol(value) + + @Since("2.1.0") + override def setOutputCol(value: String): this.type = super.setOutputCol(value) + + @Since("2.1.0") + override def setOutputDim(value: Int): this.type = super.setOutputDim(value) + + @Since("2.1.0") + def this() = { + this(Identifiable.randomUID("random projection")) + } + + /** @group setParam */ + @Since("2.1.0") + def setSeed(value: Long): this.type = set(seed, value) + + @Since("2.1.0") + override protected[this] def createRawLSHModel(inputDim: Int): SignRandomProjectionModel = { + val rand = new Random($(seed)) + val randUnitVectors: Array[Vector] = { + Array.fill($(outputDim)) { + val randArray = Array.fill(inputDim)(rand.nextGaussian()) + Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray))) + } + } + new SignRandomProjectionModel(uid, randUnitVectors) + } + + @Since("2.1.0") + override def transformSchema(schema: StructType): StructType = { + SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) + validateAndTransformSchema(schema) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala new file mode 100644 index 0000000000000..a3b89797e99b8 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import breeze.numerics.{cos, sin} +import breeze.numerics.constants.Pi + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class SignRandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { + test("SignRandomProjection") { + val data = { + for (i <- -5 until 5; j <- -5 until 5) yield Vectors.dense(i.toDouble, j.toDouble) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + val srp = new SignRandomProjection() + .setInputCol("keys") + .setOutputCol("values") + .setSeed(0) + + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, srp, 1.6, 0.4) + assert(falsePositive < 0.1) + assert(falseNegative < 0.1) + } + + test("approxNearestNeighbors for cosine distance") { + val data = { + for (i <- -5 until 5; j <- -5 until 5) yield Vectors.dense(i.toDouble, j.toDouble) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + val key = Vectors.dense(1.2, 3.4) + + val mh = new SignRandomProjection() + .setInputCol("keys") + .setOutputCol("values") + .setSeed(0) + + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, df, key, 30, + singleProbing = true) + assert(precision >= 0.8) + assert(recall >= 0.8) + } + + test("approxSimilarityJoin for cosine distance") { + val dataA = { + for (i <- -5 until 5; j <- -5 until 5) yield Vectors.dense(i.toDouble, j.toDouble) + } + val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") + + val dataB = { + for (i <- 0 until 24) yield Vectors.dense(10 * sin(Pi / 12 * i), 10 * cos(Pi / 12 * i)) + } + val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") + + val mh = new SignRandomProjection() + .setInputCol("keys") + .setOutputCol("values") + .setSeed(0) + + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, dfA, dfB, 0.5) + assert(precision == 1.0) + assert(recall >= 0.8) + } +} From 19f6d8927f56f9e67a1d4f6d9a14722392469b5a Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Tue, 11 Oct 2016 16:41:50 -0700 Subject: [PATCH 33/45] Change hashFunctions to Arrays --- .../org/apache/spark/ml/feature/MinHash.scala | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 080dcde5649a1..6e3f617695c19 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.types.StructType */ @Experimental @Since("2.1.0") -class MinHashModel private[ml] (override val uid: String, hashFunctions: Seq[Int => Long]) +class MinHashModel private[ml] (override val uid: String, hashFunctions: Array[Int => Long]) extends LSHModel[MinHashModel] { @Since("2.1.0") @@ -40,9 +40,7 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Seq[Int elems: Vector => require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.") val elemsList = elems.toSparse.indices.toList - Vectors.dense(hashFunctions.map( - func => elemsList.map(func).min.toDouble - ).toArray) + Vectors.dense(hashFunctions.map(func => elemsList.map(func).min.toDouble)) } @Since("2.1.0") @@ -102,11 +100,9 @@ class MinHash(override val uid: String) extends LSH[MinHashModel] with HasSeed { require(inputDim <= prime / 2, "The input vector dimension is too large for MinHash to handle.") val rand = new Random($(seed)) val numEntry = inputDim * 2 - val randSeq: Seq[Int] = { - Seq.fill($(outputDim))(1 + rand.nextInt(prime - 1)) - } - val hashFunctions: Seq[Int => Long] = { - randSeq.map { randCoefficient: Int => + val randArray: Array[Int] = Array.fill($(outputDim))(1 + rand.nextInt(prime - 1)) + val hashFunctions: Array[Int => Long] = { + randArray.map { randCoefficient: Int => // Perfect Hash function, use 2n buckets to reduce collision. elem: Int => (1 + elem) * randCoefficient.toLong % prime % numEntry } From 1b6317396629b9f290a279dd735923c0fc8efd89 Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Tue, 11 Oct 2016 23:47:17 -0700 Subject: [PATCH 34/45] BitSampling: LSH Class for Hamming Distance --- .../apache/spark/ml/feature/BitSampling.scala | 163 ++++++++++++++++++ .../org/apache/spark/ml/feature/MinHash.scala | 4 +- .../spark/ml/feature/RandomProjection.scala | 2 +- .../ml/feature/SignRandomProjection.scala | 2 +- .../spark/ml/feature/BitSamplingSuite.scala | 100 +++++++++++ .../feature/SignRandomProjectionSuite.scala | 8 +- 6 files changed, 271 insertions(+), 8 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/BitSampling.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/BitSamplingSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/BitSampling.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/BitSampling.scala new file mode 100644 index 0000000000000..4d768df0defe7 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/BitSampling.scala @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{IntParam, Params, ParamValidators} +import org.apache.spark.ml.param.shared.HasSeed +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.sql.types.StructType + + +/** + * :: Experimental :: + * Params for [[BitSampling]]. + */ +@Since("2.1.0") +private[ml] trait BitSamplingParams extends Params { + + /** + * The number of sampling bits, a larger sample size lowers the false negative rate. + * @group param + */ + @Since("2.1.0") + val sampleSize: IntParam = new IntParam(this, "sampleSize", + "The number of sampling bits, a larger sample size lowers the false negative rate.", + ParamValidators.inRange(0, 64, false, false)) + + /** @group getParam */ + @Since("2.1.0") + final def getSampleSize: Double = $(sampleSize) +} + + +/** + * :: Experimental :: + * Model produced by [[BitSampling]] + * @param sampleIndices An array of seqs of sample indices. Each seq represents a hash function. + */ +@Experimental +@Since("2.1.0") +class BitSamplingModel private[ml] (override val uid: String, sampleIndices: Array[Seq[Int]]) + extends LSHModel[BitSamplingModel] with BitSamplingParams { + + @Since("2.1.0") + override protected[this] val hashFunction: Vector => Vector = { + key: Vector => + val hashValues: Array[Double] = sampleIndices.map({ indices: Seq[Int] => + val bits = indices.map(key(_)) + // Use a numeric number to represent the hash value. + var hashValue = 0x0L + bits.indices.foreach({ i: Int => + if (bits(i) != 0) hashValue |= 0x1L << i + }) + hashValue.toDouble + }) + Vectors.dense(hashValues) + } + + @Since("2.1.0") + override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { + val xSet = x.toSparse.indices.toSet + val ySet = y.toSparse.indices.toSet + val intersectionSize = xSet.intersect(ySet).size.toDouble + val unionSize = xSet.size + ySet.size - intersectionSize + unionSize - intersectionSize + } + + @Since("2.1.0") + override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { + // Since it's generated by hashing, it will be a pair of dense vectors. + x.toDense.values.zip(y.toDense.values) + .map(pair => java.lang.Long.bitCount(pair._1.toLong ^ pair._2.toLong)).min + } +} + +/** + * :: Experimental :: + * LSH class for Hamming distance. + * + * The input can be dense or sparse vectors, each dimension represents one bit. For example, + * `Vectors.sparse(10, Array[(2, 1.0), (3, 1.0), (5, 1.0)])` + * means this is a 10-bit input with value = 1 for index 2, 3, 5; value = 0 for other indices. + * Also, all non-zero values are treated as binary "1" values. + */ +@Experimental +@Since("2.1.0") +class BitSampling(override val uid: String) extends LSH[BitSamplingModel] + with BitSamplingParams with HasSeed { + + @Since("2.1.0") + override def setInputCol(value: String): this.type = super.setInputCol(value) + + @Since("2.1.0") + override def setOutputCol(value: String): this.type = super.setOutputCol(value) + + @Since("2.1.0") + override def setOutputDim(value: Int): this.type = super.setOutputDim(value) + + @Since("2.1.0") + def this() = { + this(Identifiable.randomUID("min hash")) + } + + /** @group setParam */ + @Since("2.1.0") + def setSeed(value: Long): this.type = set(seed, value) + + /** @group setParam */ + @Since("2.1.0") + def setSampleSize(value: Int): this.type = set(sampleSize, value) + + private[this] def sampleWithoutReplacement(rand: Random, n: Int, k: Int): Seq[Int] = { + // Fisher-Yates method for sampling without replacement + var remainingSize = n + // Keep an map, where keys are all used, and values are all unused. + val indexMap: mutable.Map[Int, Int] = mutable.Map.empty + Seq.fill(k)({ + val index = rand.nextInt(remainingSize) + val result = indexMap.getOrElse(index, index) + remainingSize -= 1 + // The index has been used. If it's likely to be chosen in the future, let it point to an + // unused value. + val valueToMove = indexMap.getOrElse(remainingSize, remainingSize) + if (index < valueToMove) indexMap.put(index, valueToMove) + result + }) + } + + @Since("2.1.0") + override protected[this] def createRawLSHModel(inputDim: Int): BitSamplingModel = { + require($(sampleSize) <= inputDim, "sampleSize cannot be larger than the input dimension") + val rand = new Random($(seed)) + val sampleIndices: Array[Seq[Int]] = Array.fill($(outputDim))({ + sampleWithoutReplacement(rand, inputDim, $(sampleSize)) + }) + new BitSamplingModel(uid, sampleIndices) + } + + @Since("2.1.0") + override def transformSchema(schema: StructType): StructType = { + SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) + validateAndTransformSchema(schema) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 6e3f617695c19..0f50ef4a81c7e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.StructType /** * :: Experimental :: * Model produced by [[MinHash]] - * @param hashFunctions A seq of hash functions, mapping elements to their hash values. + * @param hashFunctions An array of hash functions, mapping elements to their hash values. */ @Experimental @Since("2.1.0") @@ -56,7 +56,7 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Array[I @Since("2.1.0") override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. - x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min + x.toDense.values.zip(y.toDense.values).map(pair => math.abs(pair._1 - pair._2)).min } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index be5d3c40746d8..d2aa1702bfe60 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -82,7 +82,7 @@ class RandomProjectionModel private[ml] ( @Since("2.1.0") override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. - x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min + x.toDense.values.zip(y.toDense.values).map(pair => math.abs(pair._1 - pair._2)).min } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala index 11817b27b883b..46167fdec0085 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala @@ -58,7 +58,7 @@ class SignRandomProjectionModel private[ml] ( @Since("2.1.0") override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. - x.toDense.values.zip(y.toDense.values).map(x => math.abs(x._1 - x._2)).min + x.toDense.values.zip(y.toDense.values).map(pair => math.abs(pair._1 - pair._2)).min } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BitSamplingSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BitSamplingSuite.scala new file mode 100644 index 0000000000000..77d418d9b4994 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BitSamplingSuite.scala @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class BitSamplingSuite extends SparkFunSuite with MLlibTestSparkContext { + test("BitSampling") { + val data = { + for (i <- 0 to 10) yield Vectors.sparse(10, (0 until i).map((_, 1.0))) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + val bs = new BitSampling() + .setSampleSize(3) + .setInputCol("keys") + .setOutputCol("values") + .setSeed(0) + + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, bs, 5.0, 2.0) + assert(falsePositive < 0.1) + assert(falseNegative < 0.15) + } + + test("BitSampling for max sample size") { + val data = { + for (i <- 0 to 100) yield Vectors.sparse(100, (0 until i).map((_, 1.0))) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + val bs = new BitSampling() + .setSampleSize(63) + .setInputCol("keys") + .setOutputCol("values") + .setSeed(0) + + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, bs, 10.0, 5.0) + assert(falsePositive == 0.0) + assert(falseNegative <= 0.07) + } + + test("approxNearestNeighbors for bit sampling") { + val data = { + for (i <- 0 to 100) yield Vectors.sparse(100, (0 until i).map((_, 1.0))) + } + val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + + val bs = new BitSampling() + .setSampleSize(3) + .setInputCol("keys") + .setOutputCol("values") + .setSeed(0) + + val key: Vector = Vectors.sparse(100, (50 until 100).map((_, 1.0))) + + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(bs, df, key, 40, + singleProbing = false) + assert(precision == 1.0) + assert(recall == 1.0) + } + + test("approxSimilarityJoin for bit sampling on different dataset") { + val dataA = { + for (i <- 0 to 100) yield Vectors.sparse(100, (0 until i).map((_, 1.0))) + } + val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") + + val dataB = { + for (i <- 0 to 100) yield Vectors.sparse(100, (i until 100).map((_, 1.0))) + } + val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") + + val bs = new BitSampling() + .setSampleSize(3) + .setInputCol("keys") + .setOutputCol("values") + .setSeed(0) + + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(bs, dfA, dfB, 10.0) + assert(precision == 1.0) + assert(recall == 1.0) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala index a3b89797e99b8..396d641f5a3e3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala @@ -48,12 +48,12 @@ class SignRandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") val key = Vectors.dense(1.2, 3.4) - val mh = new SignRandomProjection() + val srp = new SignRandomProjection() .setInputCol("keys") .setOutputCol("values") .setSeed(0) - val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, df, key, 30, + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(srp, df, key, 30, singleProbing = true) assert(precision >= 0.8) assert(recall >= 0.8) @@ -70,12 +70,12 @@ class SignRandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext } val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") - val mh = new SignRandomProjection() + val srp = new SignRandomProjection() .setInputCol("keys") .setOutputCol("values") .setSeed(0) - val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, dfA, dfB, 0.5) + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(srp, dfA, dfB, 0.5) assert(precision == 1.0) assert(recall >= 0.8) } From a35e26186a0d069e1c43907e257fa7b4ab31d140 Mon Sep 17 00:00:00 2001 From: Yunni Date: Thu, 13 Oct 2016 02:13:50 -0400 Subject: [PATCH 35/45] Move distinct() before calculating the distance to improve running time --- mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index d99d2908c7c5f..aa9bbd2037291 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -261,7 +261,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP // Do a hash join on where the exploded hash values are equal. val joinedDataset = explodedA.join(explodedB, explodeCols) - .drop(explodeCols: _*) + .drop(explodeCols: _*).distinct() // Add a new column to store the distance of the two records. val distUDF = udf((x: Vector, y: Vector) => keyDistance(x, y), DataTypes.DoubleType) @@ -271,7 +271,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP ) // Filter the joined datasets where the distance are smaller than the threshold. - joinedDatasetWithDist.filter(col(distCol) < threshold).distinct() + joinedDatasetWithDist.filter(col(distCol) < threshold) } /** From 66d553a4e2bd8c219c09e17db11962cd49114a24 Mon Sep 17 00:00:00 2001 From: Yunni Date: Mon, 17 Oct 2016 02:19:02 -0400 Subject: [PATCH 36/45] For similarity join, expose leftCol and rightCol as parameters --- .../org/apache/spark/ml/feature/LSH.scala | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index aa9bbd2037291..21272a4f44a15 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -236,6 +236,8 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP * @param datasetB Another dataset to join * @param threshold The threshold for the distance of record pairs * @param distCol Output column for storing the distance between each result record and the key + * @param leftColName The alias of all columns of datasetA in the output Dataset + * @param rightColName The alias of all columns of datasetB in the output Dataset * @return A joined dataset containing pairs of records. A distCol is added to show the distance * between each pair of records. */ @@ -244,19 +246,20 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP datasetA: Dataset[_], datasetB: Dataset[_], threshold: Double, - distCol: String): Dataset[_] = { + distCol: String, + leftColName: String, + rightColName: String): Dataset[_] = { val explodeCols = Seq("entry", "hashValue") - val inputName = "input" - val explodedA = processDataset(datasetA, inputName, explodeCols) + val explodedA = processDataset(datasetA, leftColName, explodeCols) // If this is a self join, we need to recreate the inputCol of datasetB to avoid ambiguity. // TODO: Remove recreateCol logic once SPARK-17154 is resolved. val explodedB = if (datasetA != datasetB) { - processDataset(datasetB, inputName, explodeCols) + processDataset(datasetB, rightColName, explodeCols) } else { val recreatedB = recreateCol(datasetB, $(inputCol), s"${$(inputCol)}#${Random.nextString(5)}") - processDataset(recreatedB, inputName, explodeCols) + processDataset(recreatedB, rightColName, explodeCols) } // Do a hash join on where the exploded hash values are equal. @@ -266,8 +269,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP // Add a new column to store the distance of the two records. val distUDF = udf((x: Vector, y: Vector) => keyDistance(x, y), DataTypes.DoubleType) val joinedDatasetWithDist = joinedDataset.select(col("*"), - distUDF(explodedA(s"$inputName.${$(inputCol)}"), - explodedB(s"$inputName.${$(inputCol)}")).as(distCol) + distUDF(col(s"$leftColName.${$(inputCol)}"), col(s"$rightColName.${$(inputCol)}")).as(distCol) ) // Filter the joined datasets where the distance are smaller than the threshold. @@ -275,14 +277,15 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP } /** - * Overloaded method for approxSimilarityJoin. Use "distCol" as default distCol. + * Overloaded method for approxSimilarityJoin. Use "distCol" as default distCol, "leftCol" as + * default leftCol, rightCol as default rightCol */ @Since("2.1.0") def approxSimilarityJoin( datasetA: Dataset[_], datasetB: Dataset[_], threshold: Double): Dataset[_] = { - approxSimilarityJoin(datasetA, datasetB, threshold, "distCol") + approxSimilarityJoin(datasetA, datasetB, threshold, "distCol", "leftCol", "rightCol") } } From cad4ecb3cea47e16b9c1073d30d8fd57bc397621 Mon Sep 17 00:00:00 2001 From: Yunni Date: Sat, 22 Oct 2016 18:49:11 -0400 Subject: [PATCH 37/45] Code Review comments: (1) Save BitSampling and SignRandomProjection for a follow-up PR (2) Use 'datasetA' and 'datasetB' as the default colNames in Similarity Join --- .../apache/spark/ml/feature/BitSampling.scala | 163 ------------------ .../org/apache/spark/ml/feature/LSH.scala | 13 +- .../ml/feature/SignRandomProjection.scala | 118 ------------- .../spark/ml/feature/BitSamplingSuite.scala | 100 ----------- .../feature/SignRandomProjectionSuite.scala | 82 --------- 5 files changed, 5 insertions(+), 471 deletions(-) delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/BitSampling.scala delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala delete mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/BitSamplingSuite.scala delete mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/BitSampling.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/BitSampling.scala deleted file mode 100644 index 4d768df0defe7..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/BitSampling.scala +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.feature - -import scala.collection.mutable -import scala.util.Random - -import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} -import org.apache.spark.ml.param.{IntParam, Params, ParamValidators} -import org.apache.spark.ml.param.shared.HasSeed -import org.apache.spark.ml.util.{Identifiable, SchemaUtils} -import org.apache.spark.sql.types.StructType - - -/** - * :: Experimental :: - * Params for [[BitSampling]]. - */ -@Since("2.1.0") -private[ml] trait BitSamplingParams extends Params { - - /** - * The number of sampling bits, a larger sample size lowers the false negative rate. - * @group param - */ - @Since("2.1.0") - val sampleSize: IntParam = new IntParam(this, "sampleSize", - "The number of sampling bits, a larger sample size lowers the false negative rate.", - ParamValidators.inRange(0, 64, false, false)) - - /** @group getParam */ - @Since("2.1.0") - final def getSampleSize: Double = $(sampleSize) -} - - -/** - * :: Experimental :: - * Model produced by [[BitSampling]] - * @param sampleIndices An array of seqs of sample indices. Each seq represents a hash function. - */ -@Experimental -@Since("2.1.0") -class BitSamplingModel private[ml] (override val uid: String, sampleIndices: Array[Seq[Int]]) - extends LSHModel[BitSamplingModel] with BitSamplingParams { - - @Since("2.1.0") - override protected[this] val hashFunction: Vector => Vector = { - key: Vector => - val hashValues: Array[Double] = sampleIndices.map({ indices: Seq[Int] => - val bits = indices.map(key(_)) - // Use a numeric number to represent the hash value. - var hashValue = 0x0L - bits.indices.foreach({ i: Int => - if (bits(i) != 0) hashValue |= 0x1L << i - }) - hashValue.toDouble - }) - Vectors.dense(hashValues) - } - - @Since("2.1.0") - override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { - val xSet = x.toSparse.indices.toSet - val ySet = y.toSparse.indices.toSet - val intersectionSize = xSet.intersect(ySet).size.toDouble - val unionSize = xSet.size + ySet.size - intersectionSize - unionSize - intersectionSize - } - - @Since("2.1.0") - override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { - // Since it's generated by hashing, it will be a pair of dense vectors. - x.toDense.values.zip(y.toDense.values) - .map(pair => java.lang.Long.bitCount(pair._1.toLong ^ pair._2.toLong)).min - } -} - -/** - * :: Experimental :: - * LSH class for Hamming distance. - * - * The input can be dense or sparse vectors, each dimension represents one bit. For example, - * `Vectors.sparse(10, Array[(2, 1.0), (3, 1.0), (5, 1.0)])` - * means this is a 10-bit input with value = 1 for index 2, 3, 5; value = 0 for other indices. - * Also, all non-zero values are treated as binary "1" values. - */ -@Experimental -@Since("2.1.0") -class BitSampling(override val uid: String) extends LSH[BitSamplingModel] - with BitSamplingParams with HasSeed { - - @Since("2.1.0") - override def setInputCol(value: String): this.type = super.setInputCol(value) - - @Since("2.1.0") - override def setOutputCol(value: String): this.type = super.setOutputCol(value) - - @Since("2.1.0") - override def setOutputDim(value: Int): this.type = super.setOutputDim(value) - - @Since("2.1.0") - def this() = { - this(Identifiable.randomUID("min hash")) - } - - /** @group setParam */ - @Since("2.1.0") - def setSeed(value: Long): this.type = set(seed, value) - - /** @group setParam */ - @Since("2.1.0") - def setSampleSize(value: Int): this.type = set(sampleSize, value) - - private[this] def sampleWithoutReplacement(rand: Random, n: Int, k: Int): Seq[Int] = { - // Fisher-Yates method for sampling without replacement - var remainingSize = n - // Keep an map, where keys are all used, and values are all unused. - val indexMap: mutable.Map[Int, Int] = mutable.Map.empty - Seq.fill(k)({ - val index = rand.nextInt(remainingSize) - val result = indexMap.getOrElse(index, index) - remainingSize -= 1 - // The index has been used. If it's likely to be chosen in the future, let it point to an - // unused value. - val valueToMove = indexMap.getOrElse(remainingSize, remainingSize) - if (index < valueToMove) indexMap.put(index, valueToMove) - result - }) - } - - @Since("2.1.0") - override protected[this] def createRawLSHModel(inputDim: Int): BitSamplingModel = { - require($(sampleSize) <= inputDim, "sampleSize cannot be larger than the input dimension") - val rand = new Random($(seed)) - val sampleIndices: Array[Seq[Int]] = Array.fill($(outputDim))({ - sampleWithoutReplacement(rand, inputDim, $(sampleSize)) - }) - new BitSamplingModel(uid, sampleIndices) - } - - @Since("2.1.0") - override def transformSchema(schema: StructType): StructType = { - SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) - validateAndTransformSchema(schema) - } -} diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 21272a4f44a15..cea98035818cf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -236,8 +236,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP * @param datasetB Another dataset to join * @param threshold The threshold for the distance of record pairs * @param distCol Output column for storing the distance between each result record and the key - * @param leftColName The alias of all columns of datasetA in the output Dataset - * @param rightColName The alias of all columns of datasetB in the output Dataset * @return A joined dataset containing pairs of records. A distCol is added to show the distance * between each pair of records. */ @@ -246,10 +244,10 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP datasetA: Dataset[_], datasetB: Dataset[_], threshold: Double, - distCol: String, - leftColName: String, - rightColName: String): Dataset[_] = { + distCol: String): Dataset[_] = { + val leftColName = "datasetA" + val rightColName = "datasetB" val explodeCols = Seq("entry", "hashValue") val explodedA = processDataset(datasetA, leftColName, explodeCols) @@ -277,15 +275,14 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP } /** - * Overloaded method for approxSimilarityJoin. Use "distCol" as default distCol, "leftCol" as - * default leftCol, rightCol as default rightCol + * Overloaded method for approxSimilarityJoin. Use "distCol" as default distCol. */ @Since("2.1.0") def approxSimilarityJoin( datasetA: Dataset[_], datasetB: Dataset[_], threshold: Double): Dataset[_] = { - approxSimilarityJoin(datasetA, datasetB, threshold, "distCol", "leftCol", "rightCol") + approxSimilarityJoin(datasetA, datasetB, threshold, "distCol") } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala deleted file mode 100644 index 46167fdec0085..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.feature - -import scala.util.Random - -import breeze.linalg.normalize - -import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT} -import org.apache.spark.ml.param.shared.HasSeed -import org.apache.spark.ml.util.{Identifiable, SchemaUtils} -import org.apache.spark.sql.types.StructType - -/** - * :: Experimental :: - * Model produced by [[SignRandomProjection]] - * @param randUnitVectors An array of random unit vectors. Each vector represents a hash function. - */ -@Experimental -@Since("2.1.0") -class SignRandomProjectionModel private[ml] ( - override val uid: String, - val randUnitVectors: Array[Vector]) - extends LSHModel[SignRandomProjectionModel] { - - @Since("2.1.0") - override protected[this] val hashFunction: (Vector) => Vector = { - key: Vector => { - val hashValues: Array[Double] = randUnitVectors.map({ - randUnitVector => Math.signum(BLAS.dot(key, randUnitVector)) - }) - Vectors.dense(hashValues) - } - } - - @Since("2.1.0") - override protected[ml] def keyDistance(x: Vector, y: Vector): Double = { - // 1 - cosine similarity - 1 - BLAS.dot(x, y) / (Vectors.norm(x, 2) * Vectors.norm(y, 2)) - } - - @Since("2.1.0") - override protected[ml] def hashDistance(x: Vector, y: Vector): Double = { - // Since it's generated by hashing, it will be a pair of dense vectors. - x.toDense.values.zip(y.toDense.values).map(pair => math.abs(pair._1 - pair._2)).min - } -} - -/** - * :: Experimental :: - * This [[SignRandomProjectionModel]] implements Locality Sensitive Hashing functions for cosine - * distance metrics. - * - * The input is dense or sparse vectors, each of which represents a point in the space. The output - * will be vectors of configurable dimension, taking values from {-1, 1, 0}. Hash value in the same - * dimension is calculated by the same hash function. - * - * References: - * Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint - * arXiv:1408.2927 (2014). - */ -@Experimental -@Since("2.1.0") -class SignRandomProjection(override val uid: String) extends LSH[SignRandomProjectionModel] - with HasSeed { - - @Since("2.1.0") - override def setInputCol(value: String): this.type = super.setInputCol(value) - - @Since("2.1.0") - override def setOutputCol(value: String): this.type = super.setOutputCol(value) - - @Since("2.1.0") - override def setOutputDim(value: Int): this.type = super.setOutputDim(value) - - @Since("2.1.0") - def this() = { - this(Identifiable.randomUID("random projection")) - } - - /** @group setParam */ - @Since("2.1.0") - def setSeed(value: Long): this.type = set(seed, value) - - @Since("2.1.0") - override protected[this] def createRawLSHModel(inputDim: Int): SignRandomProjectionModel = { - val rand = new Random($(seed)) - val randUnitVectors: Array[Vector] = { - Array.fill($(outputDim)) { - val randArray = Array.fill(inputDim)(rand.nextGaussian()) - Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray))) - } - } - new SignRandomProjectionModel(uid, randUnitVectors) - } - - @Since("2.1.0") - override def transformSchema(schema: StructType): StructType = { - SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) - validateAndTransformSchema(schema) - } -} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BitSamplingSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BitSamplingSuite.scala deleted file mode 100644 index 77d418d9b4994..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BitSamplingSuite.scala +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.feature - -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.mllib.util.MLlibTestSparkContext - -class BitSamplingSuite extends SparkFunSuite with MLlibTestSparkContext { - test("BitSampling") { - val data = { - for (i <- 0 to 10) yield Vectors.sparse(10, (0 until i).map((_, 1.0))) - } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") - - val bs = new BitSampling() - .setSampleSize(3) - .setInputCol("keys") - .setOutputCol("values") - .setSeed(0) - - val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, bs, 5.0, 2.0) - assert(falsePositive < 0.1) - assert(falseNegative < 0.15) - } - - test("BitSampling for max sample size") { - val data = { - for (i <- 0 to 100) yield Vectors.sparse(100, (0 until i).map((_, 1.0))) - } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") - - val bs = new BitSampling() - .setSampleSize(63) - .setInputCol("keys") - .setOutputCol("values") - .setSeed(0) - - val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, bs, 10.0, 5.0) - assert(falsePositive == 0.0) - assert(falseNegative <= 0.07) - } - - test("approxNearestNeighbors for bit sampling") { - val data = { - for (i <- 0 to 100) yield Vectors.sparse(100, (0 until i).map((_, 1.0))) - } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") - - val bs = new BitSampling() - .setSampleSize(3) - .setInputCol("keys") - .setOutputCol("values") - .setSeed(0) - - val key: Vector = Vectors.sparse(100, (50 until 100).map((_, 1.0))) - - val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(bs, df, key, 40, - singleProbing = false) - assert(precision == 1.0) - assert(recall == 1.0) - } - - test("approxSimilarityJoin for bit sampling on different dataset") { - val dataA = { - for (i <- 0 to 100) yield Vectors.sparse(100, (0 until i).map((_, 1.0))) - } - val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") - - val dataB = { - for (i <- 0 to 100) yield Vectors.sparse(100, (i until 100).map((_, 1.0))) - } - val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") - - val bs = new BitSampling() - .setSampleSize(3) - .setInputCol("keys") - .setOutputCol("values") - .setSeed(0) - - val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(bs, dfA, dfB, 10.0) - assert(precision == 1.0) - assert(recall == 1.0) - } -} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala deleted file mode 100644 index 396d641f5a3e3..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.feature - -import breeze.numerics.{cos, sin} -import breeze.numerics.constants.Pi - -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.mllib.util.MLlibTestSparkContext - -class SignRandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { - test("SignRandomProjection") { - val data = { - for (i <- -5 until 5; j <- -5 until 5) yield Vectors.dense(i.toDouble, j.toDouble) - } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") - - val srp = new SignRandomProjection() - .setInputCol("keys") - .setOutputCol("values") - .setSeed(0) - - val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, srp, 1.6, 0.4) - assert(falsePositive < 0.1) - assert(falseNegative < 0.1) - } - - test("approxNearestNeighbors for cosine distance") { - val data = { - for (i <- -5 until 5; j <- -5 until 5) yield Vectors.dense(i.toDouble, j.toDouble) - } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") - val key = Vectors.dense(1.2, 3.4) - - val srp = new SignRandomProjection() - .setInputCol("keys") - .setOutputCol("values") - .setSeed(0) - - val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(srp, df, key, 30, - singleProbing = true) - assert(precision >= 0.8) - assert(recall >= 0.8) - } - - test("approxSimilarityJoin for cosine distance") { - val dataA = { - for (i <- -5 until 5; j <- -5 until 5) yield Vectors.dense(i.toDouble, j.toDouble) - } - val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") - - val dataB = { - for (i <- 0 until 24) yield Vectors.dense(10 * sin(Pi / 12 * i), 10 * cos(Pi / 12 * i)) - } - val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") - - val srp = new SignRandomProjection() - .setInputCol("keys") - .setOutputCol("values") - .setSeed(0) - - val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(srp, dfA, dfB, 0.5) - assert(precision == 1.0) - assert(recall >= 0.8) - } -} From e14f73e8a49d409e09a6ed541d4b40f07dc81013 Mon Sep 17 00:00:00 2001 From: Yunni Date: Sat, 22 Oct 2016 21:08:13 -0400 Subject: [PATCH 38/45] (1) Reset all random seed != 0 (2) Add docstring about the output schema of Similarity Join (3) Change 'record' -> 'row' for clarity --- .../org/apache/spark/ml/feature/LSH.scala | 16 +++++------ .../spark/ml/feature/MinHashSuite.scala | 12 ++++---- .../ml/feature/RandomProjectionSuite.scala | 28 +++++++++---------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index cea98035818cf..be4beed938975 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -126,9 +126,9 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP * @param key Feature vector representing the item to search for * @param numNearestNeighbors The maximum number of nearest neighbors * @param singleProbing True for using Single Probing; false for multiple probing - * @param distCol Output column for storing the distance between each result record and the key + * @param distCol Output column for storing the distance between each result row and the key * @return A dataset containing at most k items closest to the key. A distCol is added to show - * the distance between each record and the key. + * the distance between each row and the key. */ @Since("2.1.0") def approxNearestNeighbors( @@ -227,17 +227,17 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP } /** - * Join two dataset to approximately find all pairs of records whose distance are smaller than + * Join two dataset to approximately find all pairs of rows whose distance are smaller than * the threshold. If the [[outputCol]] is missing, the method will transform the data; if the * [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the transformed * data when necessary. * * @param datasetA One of the datasets to join * @param datasetB Another dataset to join - * @param threshold The threshold for the distance of record pairs - * @param distCol Output column for storing the distance between each result record and the key - * @return A joined dataset containing pairs of records. A distCol is added to show the distance - * between each pair of records. + * @param threshold The threshold for the distance of row pairs + * @param distCol Output column for storing the distance between each result row and the key + * @return A joined dataset containing pairs of rows. The original rows are in columns + * "datasetA" and "datasetB", and a distCol is added to show the distance of each pair */ @Since("2.1.0") def approxSimilarityJoin( @@ -264,7 +264,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP val joinedDataset = explodedA.join(explodedB, explodeCols) .drop(explodeCols: _*).distinct() - // Add a new column to store the distance of the two records. + // Add a new column to store the distance of the two rows. val distUDF = udf((x: Vector, y: Vector) => keyDistance(x, y), DataTypes.DoubleType) val joinedDatasetWithDist = joinedDataset.select(col("*"), distUDF(col(s"$leftColName.${$(inputCol)}"), col(s"$rightColName.${$(inputCol)}")).as(distCol) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index c706ff78c9456..3b2b4304c4469 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -32,10 +32,10 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(1) .setInputCol("keys") .setOutputCol("values") - .setSeed(0) + .setSeed(12344) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, mh, 0.75, 0.5) - assert(falsePositive < 0.03) + assert(falsePositive < 0.06) assert(falseNegative < 0.01) } @@ -49,15 +49,15 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(20) .setInputCol("keys") .setOutputCol("values") - .setSeed(0) + .setSeed(12345) val key: Vector = Vectors.sparse(100, (0 until 100).filter(_.toString.contains("1")).map((_, 1.0))) val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, df, key, 20, singleProbing = true) - assert(precision >= 0.9) - assert(recall >= 0.9) + assert(precision >= 0.95) + assert(recall >= 0.95) } test("approxSimilarityJoin for minhash on different dataset") { @@ -75,7 +75,7 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputDim(20) .setInputCol("keys") .setOutputCol("values") - .setSeed(0) + .setSeed(12345) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, dfA, dfB, 0.5) assert(precision == 1.0) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index 0ff255623b216..dcc64a62172a0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -37,11 +37,11 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(1.0) - .setSeed(0) + .setSeed(12345) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 8.0, 2.0) - assert(falsePositive < 0.07) - assert(falseNegative < 0.05) + assert(falsePositive < 0.05) + assert(falseNegative < 0.06) } test("RandomProjection with high dimension data") { @@ -58,11 +58,11 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(2.5) - .setSeed(0) + .setSeed(12345) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 3.0, 2.0) assert(falsePositive == 0.0) - assert(falseNegative < 0.03) + assert(falseNegative < 0.05) } test("approxNearestNeighbors for random projection") { @@ -77,12 +77,12 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) - .setSeed(0) + .setSeed(12345) val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, singleProbing = true) - assert(precision >= 0.7) - assert(recall >= 0.7) + assert(precision >= 0.6) + assert(recall >= 0.6) } test("approxNearestNeighbors with multiple probing") { @@ -97,12 +97,12 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(1.0) - .setSeed(0) + .setSeed(12345) val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, singleProbing = false) - assert(precision >= 0.75) - assert(recall >= 0.75) + assert(precision >= 0.8) + assert(recall >= 0.8) } test("approxSimilarityJoin for random projection on different dataset") { @@ -121,7 +121,7 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) - .setSeed(0) + .setSeed(12345) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, dfA, dfB, 1.0) assert(precision == 1.0) @@ -139,10 +139,10 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("keys") .setOutputCol("values") .setBucketLength(4.0) - .setSeed(0) + .setSeed(12345) val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, df, df, 3.0) assert(precision == 1.0) - assert(recall >= 0.9) + assert(recall >= 0.8) } } From 1c4b9fb6821d5f86037a5f55976a72e85cb2440b Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Wed, 26 Oct 2016 17:10:46 -0700 Subject: [PATCH 39/45] (1) Add readers/writers (2) Change unit tests thresholds to more rebost values (3) Add more units around params, schemas and internal functions --- .../org/apache/spark/ml/feature/LSH.scala | 18 ++- .../org/apache/spark/ml/feature/MinHash.scala | 102 ++++++++++++--- .../spark/ml/feature/RandomProjection.scala | 79 +++++++++++- .../org/apache/spark/ml/feature/LSHTest.scala | 20 ++- .../spark/ml/feature/MinHashSuite.scala | 83 ++++++++++--- .../ml/feature/RandomProjectionSuite.scala | 116 +++++++++++++----- 6 files changed, 331 insertions(+), 87 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index be4beed938975..819f9a460b66b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -22,9 +22,9 @@ import scala.util.Random import org.apache.spark.annotation.Since import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, VectorUDT} -import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} +import org.apache.spark.ml.param.{IntParam, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @@ -67,18 +67,16 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { * Model produced by [[LSH]]. */ @Since("2.1.0") -private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHParams { +private[ml] abstract class LSHModel[T <: LSHModel[T]] + extends Model[T] with LSHParams with MLWritable { self: T => - @Since("2.1.0") - override def copy(extra: ParamMap): T = defaultCopy(extra) - /** * The hash function of LSH, mapping a predefined KeyType to a Vector * @return The mapping of LSH function. */ @Since("2.1.0") - protected[this] val hashFunction: Vector => Vector + protected[ml] val hashFunction: Vector => Vector /** * Calculate the distance between two different keys using the distance metric corresponding @@ -302,7 +300,8 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHP * arXiv:1408.2927 (2014). */ @Since("2.1.0") -private[ml] abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHParams { +private[ml] abstract class LSH[T <: LSHModel[T]] + extends Estimator[T] with LSHParams with DefaultParamsWritable { self: Estimator[T] => /** @group setParam */ @@ -327,9 +326,6 @@ private[ml] abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHPa @Since("2.1.0") protected[this] def createRawLSHModel(inputDim: Int): T - @Since("2.1.0") - override def copy(extra: ParamMap): Estimator[T] = defaultCopy(extra) - @Since("2.1.0") override def fit(dataset: Dataset[_]): T = { transformSchema(dataset.schema, logging = true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 0f50ef4a81c7e..ff38ac26a1473 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -19,28 +19,40 @@ package org.apache.spark.ml.feature import scala.util.Random +import org.apache.hadoop.fs.Path + import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasSeed -import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.ml.util._ import org.apache.spark.sql.types.StructType /** * :: Experimental :: * Model produced by [[MinHash]] - * @param hashFunctions An array of hash functions, mapping elements to their hash values. + * @param numEntries The number of entries of the hash functions. + * @param randCoefficients An array of random coefficients, each used by one hash function. */ @Experimental @Since("2.1.0") -class MinHashModel private[ml] (override val uid: String, hashFunctions: Array[Int => Long]) +class MinHashModel private[ml] ( + override val uid: String, + val numEntries: Int, + val randCoefficients: Array[Int]) extends LSHModel[MinHashModel] { @Since("2.1.0") - override protected[this] val hashFunction: Vector => Vector = { + override protected[ml] val hashFunction: Vector => Vector = { elems: Vector => require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.") val elemsList = elems.toSparse.indices.toList - Vectors.dense(hashFunctions.map(func => elemsList.map(func).min.toDouble)) + val hashValues = randCoefficients.map({ randCoefficient: Int => + elemsList.map({elem: Int => + (1 + elem) * randCoefficient.toLong % MinHash.prime % numEntries + }).min.toDouble + }) + Vectors.dense(hashValues) } @Since("2.1.0") @@ -58,6 +70,12 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Array[I // Since it's generated by hashing, it will be a pair of dense vectors. x.toDense.values.zip(y.toDense.values).map(pair => math.abs(pair._1 - pair._2)).min } + + @Since("2.1.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) + + @Since("2.1.0") + override def write: MLWriter = new MinHashModel.MinHashModelWriter(this) } /** @@ -69,13 +87,14 @@ class MinHashModel private[ml] (override val uid: String, hashFunctions: Array[I * means there are 10 elements in the space. This set contains elem 2, elem 3 and elem 5. * Also, any input vector must have at least 1 non-zero indices, and all non-zero values are treated * as binary "1" values. + * + * References: + * https://en.wikipedia.org/wiki/MinHash */ @Experimental @Since("2.1.0") class MinHash(override val uid: String) extends LSH[MinHashModel] with HasSeed { - // A large prime smaller than sqrt(2^63 − 1) - private[this] val prime = 2038074743 @Since("2.1.0") override def setInputCol(value: String): this.type = super.setInputCol(value) @@ -96,18 +115,13 @@ class MinHash(override val uid: String) extends LSH[MinHashModel] with HasSeed { def setSeed(value: Long): this.type = set(seed, value) @Since("2.1.0") - override protected[this] def createRawLSHModel(inputDim: Int): MinHashModel = { - require(inputDim <= prime / 2, "The input vector dimension is too large for MinHash to handle.") + override protected[ml] def createRawLSHModel(inputDim: Int): MinHashModel = { + require(inputDim <= MinHash.prime / 2, + "The input vector dimension is too large for MinHash to handle.") val rand = new Random($(seed)) val numEntry = inputDim * 2 - val randArray: Array[Int] = Array.fill($(outputDim))(1 + rand.nextInt(prime - 1)) - val hashFunctions: Array[Int => Long] = { - randArray.map { randCoefficient: Int => - // Perfect Hash function, use 2n buckets to reduce collision. - elem: Int => (1 + elem) * randCoefficient.toLong % prime % numEntry - } - } - new MinHashModel(uid, hashFunctions) + val randCoofs: Array[Int] = Array.fill($(outputDim))(1 + rand.nextInt(MinHash.prime - 1)) + new MinHashModel(uid, numEntry, randCoofs) } @Since("2.1.0") @@ -115,4 +129,58 @@ class MinHash(override val uid: String) extends LSH[MinHashModel] with HasSeed { SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) validateAndTransformSchema(schema) } + + @Since("2.1.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) +} + +@Since("2.1.0") +object MinHash extends DefaultParamsReadable[MinHash] { + // A large prime smaller than sqrt(2^63 − 1) + private[ml] val prime = 2038074743 + + @Since("2.1.0") + override def load(path: String): MinHash = super.load(path) +} + +@Since("2.1.0") +object MinHashModel extends MLReadable[MinHashModel] { + + @Since("2.1.0") + override def read: MLReader[MinHashModel] = new MinHashModelReader + + @Since("2.1.0") + override def load(path: String): MinHashModel = super.load(path) + + private[MinHashModel] class MinHashModelWriter(instance: MinHashModel) extends MLWriter { + + private case class Data(numEntries: Int, randCoefficients: Array[Int]) + + override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + // Save model data: pi, theta + val data = Data(instance.numEntries, instance.randCoefficients) + val dataPath = new Path(path, "data").toString + sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + } + } + + private class MinHashModelReader extends MLReader[MinHashModel] { + + /** Checked against metadata when loading model */ + private val className = classOf[MinHashModel].getName + + override def load(path: String): MinHashModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + + val dataPath = new Path(path, "data").toString + val data = sparkSession.read.parquet(dataPath).select("numEntries", "randCoefficients").head() + val numEntries = data.getAs[Int](0) + val randCoefficients = data.getAs[Seq[Int]](1).toArray + val model = new MinHashModel(metadata.uid, numEntries, randCoefficients) + + DefaultParamsReader.getAndSetParams(model, metadata) + model + } + } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index d2aa1702bfe60..a34527988dd21 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -20,12 +20,15 @@ package org.apache.spark.ml.feature import scala.util.Random import breeze.linalg.normalize +import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT} -import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators} +import org.apache.spark.ml.linalg._ +import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.HasSeed -import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.ml.util._ +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType /** @@ -65,7 +68,7 @@ class RandomProjectionModel private[ml] ( extends LSHModel[RandomProjectionModel] with RandomProjectionParams { @Since("2.1.0") - override protected[this] val hashFunction: (Vector) => Vector = { + override protected[ml] val hashFunction: (Vector) => Vector = { key: Vector => { val hashValues: Array[Double] = randUnitVectors.map({ randUnitVector => Math.floor(BLAS.dot(key, randUnitVector) / $(bucketLength)) @@ -84,6 +87,12 @@ class RandomProjectionModel private[ml] ( // Since it's generated by hashing, it will be a pair of dense vectors. x.toDense.values.zip(y.toDense.values).map(pair => math.abs(pair._1 - pair._2)).min } + + @Since("2.1.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) + + @Since("2.1.0") + override def write: MLWriter = new RandomProjectionModel.RandomProjectionModelWriter(this) } /** @@ -96,7 +105,8 @@ class RandomProjectionModel private[ml] ( * dimension is calculated by the same hash function. * * References: - * Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint + * 1. https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions + * 2. Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). */ @Experimental @@ -143,4 +153,63 @@ class RandomProjection(override val uid: String) extends LSH[RandomProjectionMod SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT) validateAndTransformSchema(schema) } + + @Since("2.1.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) +} + +@Since("2.1.0") +object RandomProjection extends DefaultParamsReadable[RandomProjection] { + + @Since("2.1.0") + override def load(path: String): RandomProjection = super.load(path) +} + +@Since("2.1.0") +object RandomProjectionModel extends MLReadable[RandomProjectionModel] { + + @Since("2.1.0") + override def read: MLReader[RandomProjectionModel] = new RandomProjectionModelReader + + @Since("2.1.0") + override def load(path: String): RandomProjectionModel = super.load(path) + + private[RandomProjectionModel] class RandomProjectionModelWriter(instance: RandomProjectionModel) + extends MLWriter { + + private case class Data(randUnitVectors: Matrix) + + override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + // Save model data: pi, theta + val numRows = instance.randUnitVectors.length + require(numRows > 0) + val numCols = instance.randUnitVectors.head.size + val values = instance.randUnitVectors.map(_.toArray).reduce(Array.concat(_, _)) + val randMatrix = Matrices.dense(numRows, numCols, values) + val data = Data(randMatrix) + val dataPath = new Path(path, "data").toString + sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + } + } + + private class RandomProjectionModelReader extends MLReader[RandomProjectionModel] { + + /** Checked against metadata when loading model */ + private val className = classOf[RandomProjectionModel].getName + + override def load(path: String): RandomProjectionModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + + val dataPath = new Path(path, "data").toString + val data = sparkSession.read.parquet(dataPath) + val Row(randUnitVectors: Matrix) = MLUtils.convertMatrixColumnsToML(data, "randUnitVectors") + .select("randUnitVectors") + .head() + val model = new RandomProjectionModel(metadata.uid, randUnitVectors.rowIter.toArray) + + DefaultParamsReader.getAndSetParams(model, metadata) + model + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala index bc1ea0a16de40..5c025546f332b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala @@ -17,7 +17,8 @@ package org.apache.spark.ml.feature -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{Vector, VectorUDT} +import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DataTypes @@ -57,6 +58,8 @@ private[ml] object LSHTest { val outputCol = model.getOutputCol val transformedData = model.transform(dataset) + SchemaUtils.checkColumnType(transformedData.schema, model.getOutputCol, new VectorUDT) + // Perform a cross join and label each pair of same_bucket and distance val pairs = transformedData.as("a").crossJoin(transformedData.as("b")) val distUDF = udf((x: Vector, y: Vector) => model.keyDistance(x, y), DataTypes.DoubleType) @@ -98,6 +101,15 @@ private[ml] object LSHTest { // Compute actual val actual = model.approxNearestNeighbors(dataset, key, k, singleProbing, "distCol") + assert(actual.schema.sameType(model + .transformSchema(dataset.schema) + .add("distCol", DataTypes.DoubleType)) + ) + + if (!singleProbing) { + assert(actual.count() == k) + } + // Compute precision and recall val correctCount = expected.join(actual, model.getInputCol).count().toDouble (correctCount / actual.count(), correctCount / expected.count()) @@ -128,6 +140,12 @@ private[ml] object LSHTest { // Compute actual val actual = model.approxSimilarityJoin(datasetA, datasetB, threshold) + SchemaUtils.checkColumnType(actual.schema, "distCol", DataTypes.DoubleType) + assert(actual.schema.apply("datasetA").dataType + .sameType(model.transformSchema(datasetA.schema))) + assert(actual.schema.apply("datasetB").dataType + .sameType(model.transformSchema(datasetB.schema))) + // Compute precision and recall val correctCount = actual.filter(col("distCol") < threshold).count().toDouble (correctCount / actual.count(), correctCount / expected.count()) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index 3b2b4304c4469..1aeef29863467 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -19,32 +19,75 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Dataset + +class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + @transient var dataset: Dataset[_] = _ + + override def beforeAll(): Unit = { + super.beforeAll() -class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { - test("MinHash") { val data = { for (i <- 0 to 95) yield Vectors.sparse(100, (i until i + 5).map((_, 1.0))) } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + dataset = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + } + + test("params") { + ParamsSuite.checkParams(new MinHash) + val model = new MinHashModel("mh", numEntries = 2, randCoefficients = Array(1)) + ParamsSuite.checkParams(model) + } + + test("MinHash: default params") { + val rp = new MinHash + assert(rp.getOutputDim === 1.0) + assert(rp.getOutputCol === "lshFeatures") + } + + test("read/write") { + def checkModelData(model: MinHashModel, model2: MinHashModel): Unit = { + assert(model.numEntries === model2.numEntries) + assertResult(model.randCoefficients)(model2.randCoefficients) + } + val mh = new MinHash() + val settings = Map("inputCol" -> "keys", "outputCol" -> "values") + testEstimatorAndModelReadWrite(mh, dataset, settings, checkModelData) + } + + test("hashFunction") { + val model = new MinHashModel("mh", numEntries = 20, randCoefficients = Array(0, 1, 3)) + val res = model.hashFunction(Vectors.sparse(10, Seq((2, 1.0), (3, 1.0), (5, 1.0), (7, 1.0)))) + assert(res.equals(Vectors.dense(0.0, 3.0, 4.0))) + } + + test("keyDistance and hashDistance") { + val model = new MinHashModel("mh", numEntries = 20, randCoefficients = Array(1)) + val v1 = Vectors.sparse(10, Seq((2, 1.0), (3, 1.0), (5, 1.0), (7, 1.0))) + val v2 = Vectors.sparse(10, Seq((1, 1.0), (3, 1.0), (5, 1.0), (7, 1.0), (9, 1.0))) + val keyDist = model.keyDistance(v1, v2) + val hashDist = model.hashDistance(Vectors.dense(-5, 5), Vectors.dense(1, 2)) + assert(keyDist === 0.5) + assert(hashDist === 3) + } + test("MinHash: test of LSH property") { val mh = new MinHash() .setOutputDim(1) .setInputCol("keys") .setOutputCol("values") .setSeed(12344) - val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, mh, 0.75, 0.5) - assert(falsePositive < 0.06) - assert(falseNegative < 0.01) + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(dataset, mh, 0.75, 0.5) + assert(falsePositive < 0.3) + assert(falseNegative < 0.3) } test("approxNearestNeighbors for min hash") { - val data = { - for (i <- 0 to 95) yield Vectors.sparse(100, (i until i + 5).map((_, 1.0))) - } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") - val mh = new MinHash() .setOutputDim(20) .setInputCol("keys") @@ -54,22 +97,22 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { val key: Vector = Vectors.sparse(100, (0 until 100).filter(_.toString.contains("1")).map((_, 1.0))) - val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, df, key, 20, + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(mh, dataset, key, 20, singleProbing = true) - assert(precision >= 0.95) - assert(recall >= 0.95) + assert(precision >= 0.7) + assert(recall >= 0.7) } test("approxSimilarityJoin for minhash on different dataset") { - val dataA = { + val data1 = { for (i <- 0 until 20) yield Vectors.sparse(100, (5 * i until 5 * i + 5).map((_, 1.0))) } - val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") + val df1 = spark.createDataFrame(data1.map(Tuple1.apply)).toDF("keys") - val dataB = { + val data2 = { for (i <- 0 until 30) yield Vectors.sparse(100, (3 * i until 3 * i + 3).map((_, 1.0))) } - val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") + val df2 = spark.createDataFrame(data2.map(Tuple1.apply)).toDF("keys") val mh = new MinHash() .setOutputDim(20) @@ -77,8 +120,8 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext { .setOutputCol("values") .setSeed(12345) - val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, dfA, dfB, 0.5) + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(mh, df1, df2, 0.5) assert(precision == 1.0) - assert(recall == 1.0) + assert(recall >= 0.7) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index dcc64a62172a0..dc2f922cd3a07 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -21,16 +21,79 @@ import breeze.numerics.{cos, sin} import breeze.numerics.constants.Pi import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Dataset + +class RandomProjectionSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + @transient var dataset: Dataset[_] = _ + + override def beforeAll(): Unit = { + super.beforeAll() -class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { - test("RandomProjection") { val data = { - for (i <- -5 until 5; j <- -5 until 5) yield Vectors.dense(i.toDouble, j.toDouble) + for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + dataset = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") + } + + test("params") { + ParamsSuite.checkParams(new RandomProjection) + val model = new RandomProjectionModel("rp", randUnitVectors = Array(Vectors.dense(1.0, 0.0))) + ParamsSuite.checkParams(model) + } + + test("RandomProjection: default params") { + val rp = new RandomProjection + assert(rp.getOutputDim === 1.0) + assert(rp.getOutputCol === "lshFeatures") + } + test("read/write") { + def checkModelData(model: RandomProjectionModel, model2: RandomProjectionModel): Unit = { + model.randUnitVectors.zip(model2.randUnitVectors) + .foreach(pair => assert(pair._1 === pair._2)) + } + val mh = new RandomProjection() + val settings = Map("inputCol" -> "keys", "outputCol" -> "values", "bucketLength" -> 1.0) + testEstimatorAndModelReadWrite(mh, dataset, settings, checkModelData) + } + + test("hashFunction") { + val randUnitVectors = Array(Vectors.dense(0.0, 1.0), Vectors.dense(1.0, 0.0)) + val model = new RandomProjectionModel("rp", randUnitVectors) + model.set(model.bucketLength, 0.5) + val res = model.hashFunction(Vectors.dense(1.23, 4.56)) + assert(res.equals(Vectors.dense(9.0, 2.0))) + } + + test("keyDistance and hashDistance") { + val model = new RandomProjectionModel("rp", Array(Vectors.dense(0.0, 1.0))) + val keyDist = model.keyDistance(Vectors.dense(1, 2), Vectors.dense(-2, -2)) + val hashDist = model.hashDistance(Vectors.dense(-5, 5), Vectors.dense(1, 2)) + assert(keyDist === 5) + assert(hashDist === 3) + } + + test("RandomProjection: randUnitVectors") { + val rp = new RandomProjection() + .setOutputDim(20) + .setInputCol("keys") + .setOutputCol("values") + .setBucketLength(1.0) + .setSeed(12345) + val unitVectors = rp.fit(dataset).randUnitVectors + unitVectors.foreach { v: Vector => + assert(Vectors.norm(v, 2.0) ~== 1.0 absTol 1e-14) + } + } + + test("RandomProjection: test of LSH property") { // Project from 2 dimensional Euclidean Space to 1 dimensions val rp = new RandomProjection() .setOutputDim(1) @@ -39,12 +102,12 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setBucketLength(1.0) .setSeed(12345) - val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 8.0, 2.0) - assert(falsePositive < 0.05) - assert(falseNegative < 0.06) + val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(dataset, rp, 8.0, 2.0) + assert(falsePositive < 0.4) + assert(falseNegative < 0.4) } - test("RandomProjection with high dimension data") { + test("RandomProjection with high dimension data: test of LSH property") { val numDim = 100 val data = { for (i <- 0 until numDim; j <- Seq(-2, -1, 1, 2)) @@ -61,15 +124,11 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setSeed(12345) val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, rp, 3.0, 2.0) - assert(falsePositive == 0.0) - assert(falseNegative < 0.05) + assert(falsePositive < 0.3) + assert(falseNegative < 0.3) } test("approxNearestNeighbors for random projection") { - val data = { - for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) - } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") val key = Vectors.dense(1.2, 3.4) val rp = new RandomProjection() @@ -79,17 +138,13 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setBucketLength(4.0) .setSeed(12345) - val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, dataset, key, 100, singleProbing = true) assert(precision >= 0.6) assert(recall >= 0.6) } test("approxNearestNeighbors with multiple probing") { - val data = { - for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) - } - val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("keys") val key = Vectors.dense(1.2, 3.4) val rp = new RandomProjection() @@ -99,22 +154,17 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setBucketLength(1.0) .setSeed(12345) - val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, df, key, 100, + val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(rp, dataset, key, 100, singleProbing = false) - assert(precision >= 0.8) - assert(recall >= 0.8) + assert(precision >= 0.7) + assert(recall >= 0.7) } test("approxSimilarityJoin for random projection on different dataset") { - val dataA = { - for (i <- -10 until 10; j <- -10 until 10) yield Vectors.dense(i.toDouble, j.toDouble) - } - val dfA = spark.createDataFrame(dataA.map(Tuple1.apply)).toDF("keys") - - val dataB = { + val data2 = { for (i <- 0 until 24) yield Vectors.dense(10 * sin(Pi / 12 * i), 10 * cos(Pi / 12 * i)) } - val dfB = spark.createDataFrame(dataB.map(Tuple1.apply)).toDF("keys") + val dataset2 = spark.createDataFrame(data2.map(Tuple1.apply)).toDF("keys") val rp = new RandomProjection() .setOutputDim(2) @@ -123,9 +173,9 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { .setBucketLength(4.0) .setSeed(12345) - val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, dfA, dfB, 1.0) + val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, dataset, dataset2, 1.0) assert(precision == 1.0) - assert(recall >= 0.95) + assert(recall >= 0.7) } test("approxSimilarityJoin for self join") { @@ -143,6 +193,6 @@ class RandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext { val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(rp, df, df, 3.0) assert(precision == 1.0) - assert(recall >= 0.8) + assert(recall >= 0.7) } } From 20a9ebf03d9bd1d32ea46454352a2ae5500ad5ea Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Thu, 27 Oct 2016 13:17:39 -0700 Subject: [PATCH 40/45] Change a few Since annotations --- mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala | 3 --- .../src/main/scala/org/apache/spark/ml/feature/MinHash.scala | 4 ++-- .../scala/org/apache/spark/ml/feature/RandomProjection.scala | 3 +-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 819f9a460b66b..9a07fffa9d23c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.types._ /** * Params for [[LSH]]. */ -@Since("2.1.0") private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** * Param for the dimension of LSH OR-amplification. @@ -66,7 +65,6 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** * Model produced by [[LSH]]. */ -@Since("2.1.0") private[ml] abstract class LSHModel[T <: LSHModel[T]] extends Model[T] with LSHParams with MLWritable { self: T => @@ -299,7 +297,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * (2) Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). */ -@Since("2.1.0") private[ml] abstract class LSH[T <: LSHModel[T]] extends Estimator[T] with LSHParams with DefaultParamsWritable { self: Estimator[T] => diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index ff38ac26a1473..b28e3169c97e0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -38,8 +38,8 @@ import org.apache.spark.sql.types.StructType @Since("2.1.0") class MinHashModel private[ml] ( override val uid: String, - val numEntries: Int, - val randCoefficients: Array[Int]) + @Since("2.1.0") val numEntries: Int, + @Since("2.1.0") val randCoefficients: Array[Int]) extends LSHModel[MinHashModel] { @Since("2.1.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index a34527988dd21..eab530ecafe85 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -35,7 +35,6 @@ import org.apache.spark.sql.types.StructType * :: Experimental :: * Params for [[RandomProjection]]. */ -@Since("2.1.0") private[ml] trait RandomProjectionParams extends Params { /** @@ -64,7 +63,7 @@ private[ml] trait RandomProjectionParams extends Params { @Since("2.1.0") class RandomProjectionModel private[ml] ( override val uid: String, - val randUnitVectors: Array[Vector]) + @Since("2.1.0") val randUnitVectors: Array[Vector]) extends LSHModel[RandomProjectionModel] with RandomProjectionParams { @Since("2.1.0") From 9bb3fd607519d245f72afedf95def63e0e7400a7 Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Thu, 27 Oct 2016 14:11:04 -0700 Subject: [PATCH 41/45] Code Review Comments: (1) Remove all Since in LSH (2) Add doc on hash functions in Min Hash --- .../org/apache/spark/ml/feature/LSH.scala | 20 ------------------- .../org/apache/spark/ml/feature/MinHash.scala | 12 ++++++++--- .../spark/ml/feature/RandomProjection.scala | 3 --- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 9a07fffa9d23c..9523d3f6dba4e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -19,7 +19,6 @@ package org.apache.spark.ml.feature import scala.util.Random -import org.apache.spark.annotation.Since import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{IntParam, ParamValidators} @@ -40,13 +39,11 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { * higher the dimension is, the lower the false negative rate. * @group param */ - @Since("2.1.0") final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension, where" + "increasing dimensionality lowers the false negative rate, and decreasing dimensionality" + " improves the running performance", ParamValidators.gt(0)) /** @group getParam */ - @Since("2.1.0") final def getOutputDim: Int = $(outputDim) setDefault(outputDim -> 1, outputCol -> "lshFeatures") @@ -56,7 +53,6 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { * @param schema The schema of the input dataset without [[outputCol]] * @return A derived schema with [[outputCol]] added */ - @Since("2.1.0") protected[this] final def validateAndTransformSchema(schema: StructType): StructType = { SchemaUtils.appendColumn(schema, $(outputCol), new VectorUDT) } @@ -73,7 +69,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * The hash function of LSH, mapping a predefined KeyType to a Vector * @return The mapping of LSH function. */ - @Since("2.1.0") protected[ml] val hashFunction: Vector => Vector /** @@ -83,7 +78,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * @param y One input vector in the metric space * @return The distance between x and y */ - @Since("2.1.0") protected[ml] def keyDistance(x: Vector, y: Vector): Double /** @@ -93,17 +87,14 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * @param y Another hash vector * @return The distance between hash vectors x and y */ - @Since("2.1.0") protected[ml] def hashDistance(x: Vector, y: Vector): Double - @Since("2.1.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val transformUDF = udf(hashFunction, new VectorUDT) dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) } - @Since("2.1.0") override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } @@ -126,7 +117,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * @return A dataset containing at most k items closest to the key. A distCol is added to show * the distance between each row and the key. */ - @Since("2.1.0") def approxNearestNeighbors( dataset: Dataset[_], key: Vector, @@ -168,7 +158,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * Overloaded method for approxNearestNeighbors. Use Single Probing as default way to search * nearest neighbors and "distCol" as default distCol. */ - @Since("2.1.0") def approxNearestNeighbors( dataset: Dataset[_], key: Vector, @@ -185,7 +174,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * @param explodeCols The alias for the exploded columns, must be a seq of two strings. * @return A dataset containing idCol, inputCol and explodeCols */ - @Since("2.1.0") private[this] def processDataset( dataset: Dataset[_], inputName: String, @@ -211,7 +199,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * @param tmpColName A temporary column name which does not conflict with existing columns * @return */ - @Since("2.1.0") private[this] def recreateCol( dataset: Dataset[_], colName: String, @@ -235,7 +222,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] * @return A joined dataset containing pairs of rows. The original rows are in columns * "datasetA" and "datasetB", and a distCol is added to show the distance of each pair */ - @Since("2.1.0") def approxSimilarityJoin( datasetA: Dataset[_], datasetB: Dataset[_], @@ -273,7 +259,6 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]] /** * Overloaded method for approxSimilarityJoin. Use "distCol" as default distCol. */ - @Since("2.1.0") def approxSimilarityJoin( datasetA: Dataset[_], datasetB: Dataset[_], @@ -302,15 +287,12 @@ private[ml] abstract class LSH[T <: LSHModel[T]] self: Estimator[T] => /** @group setParam */ - @Since("2.1.0") def setInputCol(value: String): this.type = set(inputCol, value) /** @group setParam */ - @Since("2.1.0") def setOutputCol(value: String): this.type = set(outputCol, value) /** @group setParam */ - @Since("2.1.0") def setOutputDim(value: Int): this.type = set(outputDim, value) /** @@ -320,10 +302,8 @@ private[ml] abstract class LSH[T <: LSHModel[T]] * @param inputDim The dimension of the input dataset * @return A new LSHModel instance without any params */ - @Since("2.1.0") protected[this] def createRawLSHModel(inputDim: Int): T - @Since("2.1.0") override def fit(dataset: Dataset[_]): T = { transformSchema(dataset.schema, logging = true) val inputDim = dataset.select(col($(inputCol))).head().get(0).asInstanceOf[Vector].size diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index b28e3169c97e0..485ba8f80bfb3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -30,7 +30,14 @@ import org.apache.spark.sql.types.StructType /** * :: Experimental :: - * Model produced by [[MinHash]] + * Model produced by [[MinHash]], where multiple hash functions are stored. Each hash function is + * a perfect hash function: + * g_i(x) = (x * k_i mod prime) mod numEntries + * where c_i is the i-th coefficient + * + * Reference: + * https://en.wikipedia.org/wiki/Perfect_hash_function + * * @param numEntries The number of entries of the hash functions. * @param randCoefficients An array of random coefficients, each used by one hash function. */ @@ -117,7 +124,7 @@ class MinHash(override val uid: String) extends LSH[MinHashModel] with HasSeed { @Since("2.1.0") override protected[ml] def createRawLSHModel(inputDim: Int): MinHashModel = { require(inputDim <= MinHash.prime / 2, - "The input vector dimension is too large for MinHash to handle.") + s"The input vector dimension $inputDim exceeds the threshold ${MinHash.prime / 2}.") val rand = new Random($(seed)) val numEntry = inputDim * 2 val randCoofs: Array[Int] = Array.fill($(outputDim))(1 + rand.nextInt(MinHash.prime - 1)) @@ -158,7 +165,6 @@ object MinHashModel extends MLReadable[MinHashModel] { override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - // Save model data: pi, theta val data = Data(instance.numEntries, instance.randCoefficients) val dataPath = new Path(path, "data").toString sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index eab530ecafe85..6e7fd3a6431cf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -44,13 +44,11 @@ private[ml] trait RandomProjectionParams extends Params { * reasonable value * @group param */ - @Since("2.1.0") val bucketLength: DoubleParam = new DoubleParam(this, "bucketLength", "the length of each hash bucket, a larger bucket lowers the false negative rate.", ParamValidators.gt(0)) /** @group getParam */ - @Since("2.1.0") final def getBucketLength: Double = $(bucketLength) } @@ -180,7 +178,6 @@ object RandomProjectionModel extends MLReadable[RandomProjectionModel] { override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - // Save model data: pi, theta val numRows = instance.randUnitVectors.length require(numRows > 0) val numCols = instance.randUnitVectors.head.size From 9a3704c6252c842c750c8cf98b0271ab51e3d44e Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Thu, 27 Oct 2016 15:56:23 -0700 Subject: [PATCH 42/45] Organize the scaladoc --- .../scala/org/apache/spark/ml/feature/MinHash.scala | 10 ++++++---- .../org/apache/spark/ml/feature/RandomProjection.scala | 10 +++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index 485ba8f80bfb3..d17a0c57c3a43 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -30,13 +30,14 @@ import org.apache.spark.sql.types.StructType /** * :: Experimental :: + * * Model produced by [[MinHash]], where multiple hash functions are stored. Each hash function is * a perfect hash function: - * g_i(x) = (x * k_i mod prime) mod numEntries - * where c_i is the i-th coefficient + * `g_i(x) = (x * k_i mod prime) mod numEntries` + * where `k_i` is the i-th coefficient * * Reference: - * https://en.wikipedia.org/wiki/Perfect_hash_function + * [[https://en.wikipedia.org/wiki/Perfect_hash_function Wikipedia on Perfect Hash Function]] * * @param numEntries The number of entries of the hash functions. * @param randCoefficients An array of random coefficients, each used by one hash function. @@ -87,6 +88,7 @@ class MinHashModel private[ml] ( /** * :: Experimental :: + * * LSH class for Jaccard distance. * * The input can be dense or sparse vectors, but it is more efficient if it is sparse. For example, @@ -96,7 +98,7 @@ class MinHashModel private[ml] ( * as binary "1" values. * * References: - * https://en.wikipedia.org/wiki/MinHash + * [[https://en.wikipedia.org/wiki/MinHash Wikipedia on MinHash]] */ @Experimental @Since("2.1.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index 6e7fd3a6431cf..78876140677aa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.types.StructType /** * :: Experimental :: + * * Params for [[RandomProjection]]. */ private[ml] trait RandomProjectionParams extends Params { @@ -40,6 +41,7 @@ private[ml] trait RandomProjectionParams extends Params { /** * The length of each hash bucket, a larger bucket lowers the false negative rate. * + * * If input vectors are normalized, 1-10 times of pow(numRecords, -1/inputDim) would be a * reasonable value * @group param @@ -54,6 +56,7 @@ private[ml] trait RandomProjectionParams extends Params { /** * :: Experimental :: + * * Model produced by [[RandomProjection]] * @param randUnitVectors An array of random unit vectors. Each vector represents a hash function. */ @@ -94,6 +97,7 @@ class RandomProjectionModel private[ml] ( /** * :: Experimental :: + * * This [[RandomProjection]] implements Locality Sensitive Hashing functions for Euclidean * distance metrics. * @@ -102,7 +106,10 @@ class RandomProjectionModel private[ml] ( * dimension is calculated by the same hash function. * * References: - * 1. https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions + * + * 1. [[https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions + * Wikipedia on Stable Distributions]] + * * 2. Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint * arXiv:1408.2927 (2014). */ @@ -174,6 +181,7 @@ object RandomProjectionModel extends MLReadable[RandomProjectionModel] { private[RandomProjectionModel] class RandomProjectionModelWriter(instance: RandomProjectionModel) extends MLWriter { + // TODO: Save using the existing format of Array[Vector] once SPARK-12878 is resolved. private case class Data(randUnitVectors: Matrix) override protected def saveImpl(path: String): Unit = { From 6cda936cf2c14f3e4c0e164b0d688fd4c8996b5d Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Thu, 27 Oct 2016 18:34:03 -0700 Subject: [PATCH 43/45] Remove default values for outputCol --- mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 9523d3f6dba4e..333a8c364a884 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -46,7 +46,7 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { /** @group getParam */ final def getOutputDim: Int = $(outputDim) - setDefault(outputDim -> 1, outputCol -> "lshFeatures") + setDefault(outputDim -> 1) /** * Transform the Schema for LSH From 97e1238ddf14938539237facf354e0ce4fc4ed1c Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Thu, 27 Oct 2016 19:26:28 -0700 Subject: [PATCH 44/45] Remove default values for outputCol --- .../test/scala/org/apache/spark/ml/feature/MinHashSuite.scala | 1 - .../org/apache/spark/ml/feature/RandomProjectionSuite.scala | 1 - 2 files changed, 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala index 1aeef29863467..c32ca7d69cf84 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashSuite.scala @@ -46,7 +46,6 @@ class MinHashSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("MinHash: default params") { val rp = new MinHash assert(rp.getOutputDim === 1.0) - assert(rp.getOutputCol === "lshFeatures") } test("read/write") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala index dc2f922cd3a07..cd82ee2117a07 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RandomProjectionSuite.scala @@ -51,7 +51,6 @@ class RandomProjectionSuite test("RandomProjection: default params") { val rp = new RandomProjection assert(rp.getOutputDim === 1.0) - assert(rp.getOutputCol === "lshFeatures") } test("read/write") { From 35708458a0ee156c097ca604efeafaa37d3c8a6d Mon Sep 17 00:00:00 2001 From: Yun Ni Date: Fri, 28 Oct 2016 13:38:38 -0700 Subject: [PATCH 45/45] Add more scaladoc --- .../scala/org/apache/spark/ml/feature/MinHash.scala | 4 ++-- .../org/apache/spark/ml/feature/RandomProjection.scala | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala index d17a0c57c3a43..d9d0f32254e24 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala @@ -33,8 +33,8 @@ import org.apache.spark.sql.types.StructType * * Model produced by [[MinHash]], where multiple hash functions are stored. Each hash function is * a perfect hash function: - * `g_i(x) = (x * k_i mod prime) mod numEntries` - * where `k_i` is the i-th coefficient + * `h_i(x) = (x * k_i mod prime) mod numEntries` + * where `k_i` is the i-th coefficient, and both `x` and `k_i` are from `Z_prime^*` * * Reference: * [[https://en.wikipedia.org/wiki/Perfect_hash_function Wikipedia on Perfect Hash Function]] diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala index 78876140677aa..1b524c6710b42 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala @@ -39,7 +39,8 @@ import org.apache.spark.sql.types.StructType private[ml] trait RandomProjectionParams extends Params { /** - * The length of each hash bucket, a larger bucket lowers the false negative rate. + * The length of each hash bucket, a larger bucket lowers the false negative rate. The number of + * buckets will be `(max L2 norm of input vectors) / bucketLength`. * * * If input vectors are normalized, 1-10 times of pow(numRecords, -1/inputDim) would be a @@ -57,7 +58,12 @@ private[ml] trait RandomProjectionParams extends Params { /** * :: Experimental :: * - * Model produced by [[RandomProjection]] + * Model produced by [[RandomProjection]], where multiple random vectors are stored. The vectors + * are normalized to be unit vectors and each vector is used in a hash function: + * `h_i(x) = floor(r_i.dot(x) / bucketLength)` + * where `r_i` is the i-th random unit vector. The number of buckets will be `(max L2 norm of input + * vectors) / bucketLength`. + * * @param randUnitVectors An array of random unit vectors. Each vector represents a hash function. */ @Experimental