apache · hhbyyh · Jun 15, 2016 · Jun 16, 2016 · Jul 10, 2016 · Jul 11, 2016
diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.fpm
+
+import scala.collection.mutable.ArrayBuffer
+import scala.reflect.ClassTag
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
+import org.apache.spark.ml.util._
+import org.apache.spark.mllib.fpm.{AssociationRules => MLlibAssociationRules,
+  FPGrowth => MLlibFPGrowth}
+import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
+import org.apache.spark.sql._
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+
+/**
+ * Common params for FPGrowth and FPGrowthModel
+ */
+private[fpm] trait FPGrowthParams extends Params with HasFeaturesCol with HasPredictionCol {
+
+  /**
+   * Minimal support level of the frequent pattern. [0.0, 1.0]. Any pattern that appears
+   * more than (minSupport * size-of-the-dataset) times will be output
+   * Default: 0.3
+   * @group param
+   */
+  @Since("2.2.0")
+  val minSupport: DoubleParam = new DoubleParam(this, "minSupport",
+    "the minimal support level of a frequent pattern",
+    ParamValidators.inRange(0.0, 1.0))
+  setDefault(minSupport -> 0.3)
+
+  /** @group getParam */
+  @Since("2.2.0")
+  def getMinSupport: Double = $(minSupport)
+
+  /**
+   * Number of partitions (>=1) used by parallel FP-growth. By default the param is not set, and
+   * partition number of the input dataset is used.
+   * @group expertParam
+   */
+  @Since("2.2.0")
+  val numPartitions: IntParam = new IntParam(this, "numPartitions",
+    "Number of partitions used by parallel FP-growth", ParamValidators.gtEq[Int](1))
+
+  /** @group expertGetParam */
+  @Since("2.2.0")
+  def getNumPartitions: Int = $(numPartitions)
+
+  /**
+   * Minimal confidence for generating Association Rule.
+   * Note that minConfidence has no effect during fitting.
+   * Default: 0.8
+   * @group param
+   */
+  @Since("2.2.0")
+  val minConfidence: DoubleParam = new DoubleParam(this, "minConfidence",
+    "minimal confidence for generating Association Rule",
+    ParamValidators.inRange(0.0, 1.0))
+  setDefault(minConfidence -> 0.8)
+
+  /** @group getParam */
+  @Since("2.2.0")
+  def getMinConfidence: Double = $(minConfidence)
+
+  /**
+   * Validates and transforms the input schema.
+   * @param schema input schema
+   * @return output schema
+   */
+  @Since("2.2.0")
+  protected def validateAndTransformSchema(schema: StructType): StructType = {
+    val inputType = schema($(featuresCol)).dataType
+    require(inputType.isInstanceOf[ArrayType],
+      s"The input column must be ArrayType, but got $inputType.")
+    SchemaUtils.appendColumn(schema, $(predictionCol), schema($(featuresCol)).dataType)
+  }
+}
+
+/**
+ * :: Experimental ::
+ * A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
+ * <a href="http://dx.doi.org/10.1145/1454008.1454027">Li et al., PFP: Parallel FP-Growth for Query
+ * Recommendation</a>. PFP distributes computation in such a way that each worker executes an
+ * independent group of mining tasks. The FP-Growth algorithm is described in
+ * <a href="http://dx.doi.org/10.1145/335191.335372">Han et al., Mining frequent patterns without
+ * candidate generation</a>. Note null values in the feature column are ignored during fit().
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Association_rule_learning">
+ * Association rule learning (Wikipedia)</a>
+ */
+@Since("2.2.0")
+@Experimental
+class FPGrowth @Since("2.2.0") (
+    @Since("2.2.0") override val uid: String)
+  extends Estimator[FPGrowthModel] with FPGrowthParams with DefaultParamsWritable {
+
+  @Since("2.2.0")
+  def this() = this(Identifiable.randomUID("fpgrowth"))
+
+  /** @group setParam */
+  @Since("2.2.0")
+  def setMinSupport(value: Double): this.type = set(minSupport, value)
+
+  /** @group expertSetParam */
+  @Since("2.2.0")
+  def setNumPartitions(value: Int): this.type = set(numPartitions, value)
+
+  /** @group setParam */
+  @Since("2.2.0")
+  def setMinConfidence(value: Double): this.type = set(minConfidence, value)
+
+  /** @group setParam */
+  @Since("2.2.0")
+  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+
+  /** @group setParam */
+  @Since("2.2.0")
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+  @Since("2.2.0")
+  override def fit(dataset: Dataset[_]): FPGrowthModel = {
+    transformSchema(dataset.schema, logging = true)
+    genericFit(dataset)
+  }
+
+  private def genericFit[T: ClassTag](dataset: Dataset[_]): FPGrowthModel = {
+    val data = dataset.select($(featuresCol))
+    val items = data.where(col($(featuresCol)).isNotNull).rdd.map(r => r.getSeq[T](0).toArray)
+    val mllibFP = new MLlibFPGrowth().setMinSupport($(minSupport))
+    if (isSet(numPartitions)) {
+      mllibFP.setNumPartitions($(numPartitions))
+    }
+    val parentModel = mllibFP.run(items)
+    val rows = parentModel.freqItemsets.map(f => Row(f.items, f.freq))
+
+    val schema = StructType(Seq(
+      StructField("items", dataset.schema($(featuresCol)).dataType, nullable = false),
+      StructField("freq", LongType, nullable = false)))
+    val frequentItems = dataset.sparkSession.createDataFrame(rows, schema)
+    copyValues(new FPGrowthModel(uid, frequentItems)).setParent(this)
+  }
+
+  @Since("2.2.0")
+  override def transformSchema(schema: StructType): StructType = {
+    validateAndTransformSchema(schema)
+  }
+
+  @Since("2.2.0")
+  override def copy(extra: ParamMap): FPGrowth = defaultCopy(extra)
+}
+
+
+@Since("2.2.0")
+object FPGrowth extends DefaultParamsReadable[FPGrowth] {
+
+  @Since("2.2.0")
+  override def load(path: String): FPGrowth = super.load(path)
+}
+
+/**
+ * :: Experimental ::
+ * Model fitted by FPGrowth.
+ *
+ * @param freqItemsets frequent items in the format of DataFrame("items"[Seq], "freq"[Long])
+ */
+@Since("2.2.0")
+@Experimental
+class FPGrowthModel private[ml] (
+    @Since("2.2.0") override val uid: String,
+    @transient val freqItemsets: DataFrame)
+  extends Model[FPGrowthModel] with FPGrowthParams with MLWritable {
+
+  /** @group setParam */
+  @Since("2.2.0")
+  def setMinConfidence(value: Double): this.type = set(minConfidence, value)
+
+  /** @group setParam */
+  @Since("2.2.0")
+  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+
+  /** @group setParam */
+  @Since("2.2.0")
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+  /**
+   * Get association rules fitted by AssociationRules using the minConfidence. Returns a dataframe
+   * with three fields, "antecedent", "consequent" and "confidence", where "antecedent" and
+   * "consequent" are Array[T] and "confidence" is Double.
+   */
+  @Since("2.2.0")
+  @transient lazy val associationRules: DataFrame = {
+    val freqItems = freqItemsets
+    AssociationRules.getAssociationRulesFromFP(freqItems, "items", "freq", $(minConfidence))
+  }
+
+  /**
+   * The transform method first generates the association rules according to the frequent itemsets.
+   * Then for each association rule, it will examine the input items against antecedents and
+   * summarize the consequents as prediction. The prediction column has the same data type as the
+   * input column(Array[T]) and will not contain existing items in the input column.
+   * Note that internally it uses Cartesian join and may exhaust memory for large datasets. The null
+   * values in the feature columns are treated as empty sets.
+   */
+  @Since("2.2.0")
+  override def transform(dataset: Dataset[_]): DataFrame = {
+    transformSchema(dataset.schema, logging = true)
+    genericTransform(dataset)
+  }
+
+  private def genericTransform[T](dataset: Dataset[_]): DataFrame = {
+    // use index to perform the join and aggregateByKey, and keep the original order after join.
+    val indexToItems = dataset.select($(featuresCol)).rdd.map(r => r.getSeq[T](0))
+      .zipWithIndex().map(_.swap)
+    val rulesRDD = associationRules.select("antecedent", "consequent").rdd
+      .map(r => (r.getSeq[T](0), r.getSeq[T](1)))
+
+    val indexToConsequents = indexToItems.cartesian(rulesRDD).map {
+      case ((id, items), (antecedent, consequent)) =>
+        val consequents = if (items != null) {
+          val itemSet = items.toSet
+          if (antecedent.forall(itemSet.contains)) {
+            consequent.filterNot(itemSet.contains)
+          } else {
+            Seq.empty
+          }
+        } else {
+          Seq.empty
+        }
+        (id, consequents)
+    }.aggregateByKey(new ArrayBuffer[T])((ar, seq) => ar ++= seq, (ar, seq) => ar ++= seq)
+     .map { case (index, cons) => (index, cons.distinct) }
+
+    val rowAndConsequents = dataset.toDF().rdd.zipWithIndex().map(_.swap)
+      .join(indexToConsequents)
+      .map(_._2).map(t => Row.merge(t._1, Row(t._2)))
+    val mergedSchema = dataset.schema.add(StructField($(predictionCol),
+      dataset.schema($(featuresCol)).dataType, dataset.schema($(featuresCol)).nullable))
+    dataset.sparkSession.createDataFrame(rowAndConsequents, mergedSchema)
+  }
+
+  @Since("2.2.0")
+  override def transformSchema(schema: StructType): StructType = {
+    validateAndTransformSchema(schema)
+  }
+
+  @Since("2.2.0")
+  override def copy(extra: ParamMap): FPGrowthModel = {
+    val copied = new FPGrowthModel(uid, freqItemsets)
+    copyValues(copied, extra).setParent(this.parent)
+  }
+
+  @Since("2.2.0")
+  override def write: MLWriter = new FPGrowthModel.FPGrowthModelWriter(this)
+}
+
+@Since("2.2.0")
+object FPGrowthModel extends MLReadable[FPGrowthModel] {
+
+  @Since("2.2.0")
+  override def read: MLReader[FPGrowthModel] = new FPGrowthModelReader
+
+  @Since("2.2.0")
+  override def load(path: String): FPGrowthModel = super.load(path)
+
+  /** [[MLWriter]] instance for [[FPGrowthModel]] */
+  private[FPGrowthModel]
+  class FPGrowthModelWriter(instance: FPGrowthModel) extends MLWriter {
+
+    override protected def saveImpl(path: String): Unit = {
+      DefaultParamsWriter.saveMetadata(instance, path, sc)
+      val dataPath = new Path(path, "data").toString
+      instance.freqItemsets.write.parquet(dataPath)
+    }
+  }
+
+  private class FPGrowthModelReader extends MLReader[FPGrowthModel] {
+
+    /** Checked against metadata when loading model */
+    private val className = classOf[FPGrowthModel].getName
+
+    override def load(path: String): FPGrowthModel = {
+      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+      val dataPath = new Path(path, "data").toString
+      val frequentItems = sparkSession.read.parquet(dataPath)
+      val model = new FPGrowthModel(metadata.uid, frequentItems)
+      DefaultParamsReader.getAndSetParams(model, metadata)
+      model
+    }
+  }
+}
+
+private[fpm] object AssociationRules {
+
+  /**
+   * Computes the association rules with confidence above minConfidence.
+   * @param dataset DataFrame("items", "freq") containing frequent itemset obtained from
+   *                algorithms like [[FPGrowth]].
+   * @param itemsCol column name for frequent itemsets
+   * @param freqCol column name for frequent itemsets count
+   * @param minConfidence minimum confidence for the result association rules
+   * @return a DataFrame("antecedent", "consequent", "confidence") containing the association
+   *         rules.
+   */
+  def getAssociationRulesFromFP[T: ClassTag](
+        dataset: Dataset[_],
+        itemsCol: String,
+        freqCol: String,
+        minConfidence: Double): DataFrame = {
+
+    val freqItemSetRdd = dataset.select(itemsCol, freqCol).rdd
+      .map(row => new FreqItemset(row.getSeq[T](0).toArray, row.getLong(1)))
+    val rows = new MLlibAssociationRules()
+      .setMinConfidence(minConfidence)
+      .run(freqItemSetRdd)
+      .map(r => Row(r.antecedent, r.consequent, r.confidence))
+
+    val dt = dataset.schema(itemsCol).dataType
+    val schema = StructType(Seq(
+      StructField("antecedent", dt, nullable = false),
+      StructField("consequent", dt, nullable = false),
+      StructField("confidence", DoubleType, nullable = false)))
+    val rules = dataset.sparkSession.createDataFrame(rows, schema)
+    rules
+  }
+}