Skip to content

Commit 651a1c0

Browse files
Jacky Li authored and mengxr committed
[SPARK-5939][MLLib] make FPGrowth example app take parameters
Add parameter parsing in the FPGrowth example app in Scala and Java. A sample data file is added in the data/mllib folder. Author: Jacky Li <[email protected]> Closes #4714 from jackylk/parameter and squashes the following commits: 8c478b3 [Jacky Li] fix according to comments 3bb74f6 [Jacky Li] make FPGrowth exampl app take parameters f0e4d10 [Jacky Li] make FPGrowth exampl app take parameters
1 parent 242d495 commit 651a1c0

File tree

3 files changed

+81
-27
lines changed

3 files changed

+81
-27
lines changed

data/mllib/sample_fpgrowth.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
r z h k p
2+
z y x w v u t s
3+
s x o n r
4+
x z y m t s q e
5+
z
6+
x z y r q t p

examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,32 +25,49 @@
2525
import org.apache.spark.SparkConf;
2626
import org.apache.spark.api.java.JavaRDD;
2727
import org.apache.spark.api.java.JavaSparkContext;
28+
import org.apache.spark.api.java.function.Function;
2829
import org.apache.spark.mllib.fpm.FPGrowth;
2930
import org.apache.spark.mllib.fpm.FPGrowthModel;
3031

3132
/**
3233
* Java example for mining frequent itemsets using FP-growth.
34+
* Example usage: ./bin/run-example mllib.JavaFPGrowthExample ./data/mllib/sample_fpgrowth.txt
3335
*/
3436
public class JavaFPGrowthExample {
3537

3638
public static void main(String[] args) {
39+
String inputFile;
40+
double minSupport = 0.3;
41+
int numPartition = -1;
42+
if (args.length < 1) {
43+
System.err.println(
44+
"Usage: JavaFPGrowth <input_file> [minSupport] [numPartition]");
45+
System.exit(1);
46+
}
47+
inputFile = args[0];
48+
if (args.length >= 2) {
49+
minSupport = Double.parseDouble(args[1]);
50+
}
51+
if (args.length >= 3) {
52+
numPartition = Integer.parseInt(args[2]);
53+
}
54+
3755
SparkConf sparkConf = new SparkConf().setAppName("JavaFPGrowthExample");
3856
JavaSparkContext sc = new JavaSparkContext(sparkConf);
3957

58+
JavaRDD<ArrayList<String>> transactions = sc.textFile(inputFile).map(
59+
new Function<String, ArrayList<String>>() {
60+
@Override
61+
public ArrayList<String> call(String s) {
62+
return Lists.newArrayList(s.split(" "));
63+
}
64+
}
65+
);
4066

41-
// TODO: Read a user-specified input file.
42-
@SuppressWarnings("unchecked")
43-
JavaRDD<ArrayList<String>> transactions = sc.parallelize(Lists.newArrayList(
44-
Lists.newArrayList("r z h k p".split(" ")),
45-
Lists.newArrayList("z y x w v u t s".split(" ")),
46-
Lists.newArrayList("s x o n r".split(" ")),
47-
Lists.newArrayList("x z y m t s q e".split(" ")),
48-
Lists.newArrayList("z".split(" ")),
49-
Lists.newArrayList("x z y r q t p".split(" "))), 2);
50-
51-
FPGrowth fpg = new FPGrowth()
52-
.setMinSupport(0.3);
53-
FPGrowthModel<String> model = fpg.run(transactions);
67+
FPGrowthModel<String> model = new FPGrowth()
68+
.setMinSupport(minSupport)
69+
.setNumPartitions(numPartition)
70+
.run(transactions);
5471

5572
for (FPGrowth.FreqItemset<String> s: model.freqItemsets().toJavaRDD().collect()) {
5673
System.out.println("[" + Joiner.on(",").join(s.javaItems()) + "], " + s.freq());

examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,30 +17,61 @@
1717

1818
package org.apache.spark.examples.mllib
1919

20+
import scopt.OptionParser
21+
2022
import org.apache.spark.mllib.fpm.FPGrowth
21-
import org.apache.spark.{SparkContext, SparkConf}
23+
import org.apache.spark.{SparkConf, SparkContext}
2224

2325
/**
2426
* Example for mining frequent itemsets using FP-growth.
27+
* Example usage: ./bin/run-example mllib.FPGrowthExample \
28+
* --minSupport 0.8 --numPartition 2 ./data/mllib/sample_fpgrowth.txt
2529
*/
2630
object FPGrowthExample {
2731

32+
case class Params(
33+
input: String = null,
34+
minSupport: Double = 0.3,
35+
numPartition: Int = -1) extends AbstractParams[Params]
36+
2837
def main(args: Array[String]) {
29-
val conf = new SparkConf().setAppName("FPGrowthExample")
38+
val defaultParams = Params()
39+
40+
val parser = new OptionParser[Params]("FPGrowthExample") {
41+
head("FPGrowth: an example FP-growth app.")
42+
opt[Double]("minSupport")
43+
.text(s"minimal support level, default: ${defaultParams.minSupport}")
44+
.action((x, c) => c.copy(minSupport = x))
45+
opt[Int]("numPartition")
46+
.text(s"number of partition, default: ${defaultParams.numPartition}")
47+
.action((x, c) => c.copy(numPartition = x))
48+
arg[String]("<input>")
49+
.text("input paths to input data set, whose file format is that each line " +
50+
"contains a transaction with each item in String and separated by a space")
51+
.required()
52+
.action((x, c) => c.copy(input = x))
53+
}
54+
55+
parser.parse(args, defaultParams).map { params =>
56+
run(params)
57+
}.getOrElse {
58+
sys.exit(1)
59+
}
60+
}
61+
62+
def run(params: Params) {
63+
val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
3064
val sc = new SparkContext(conf)
65+
val transactions = sc.textFile(params.input).map(_.split(" ")).cache()
66+
67+
println(s"Number of transactions: ${transactions.count()}")
68+
69+
val model = new FPGrowth()
70+
.setMinSupport(params.minSupport)
71+
.setNumPartitions(params.numPartition)
72+
.run(transactions)
3173

32-
// TODO: Read a user-specified input file.
33-
val transactions = sc.parallelize(Seq(
34-
"r z h k p",
35-
"z y x w v u t s",
36-
"s x o n r",
37-
"x z y m t s q e",
38-
"z",
39-
"x z y r q t p").map(_.split(" ")), numSlices = 2)
40-
41-
val fpg = new FPGrowth()
42-
.setMinSupport(0.3)
43-
val model = fpg.run(transactions)
74+
println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")
4475

4576
model.freqItemsets.collect().foreach { itemset =>
4677
println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)

0 commit comments

Comments
 (0)