-
Notifications
You must be signed in to change notification settings - Fork 0
/
TopNWindow.scala
91 lines (75 loc) · 2.79 KB
/
TopNWindow.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
/**
* This application reads a stream of words and outputs for every window the three
* words with most occurrences (= top 3).
*
* This program connects to a server socket and reads strings from the socket.
* The easiest way to try this out is to open a text server (at port 9000)
* using the ''netcat'' tool via
* {{{
* nc -l -p 9000
* }}}
*/
object TopNWindow {

  /** A word together with the number of occurrences it represents.
    *
    * Declared at object level (not inside `main`) so the type is not a
    * method-local class. NOTE(review): Flink's type analysis and serialization
    * are expected to handle top-level case classes more reliably than local
    * ones; confirm against the Flink version in use.
    *
    * @param word  the token read from the input stream
    * @param count how many occurrences this record stands for
    */
  final case class WordWithCount(word: String, count: Long)

  /** Number of highest-ranked words emitted for every window. */
  private val TopN = 3

  /** Ranks the words contained in `input` by their total count.
    *
    * The `count` fields of all records sharing a word are summed, so records
    * that already carry a count greater than one are ranked correctly.
    *
    * @param input the records of a single window; may be empty
    * @param n     the maximum number of entries to return
    * @return at most `n` `(word, total)` pairs, highest total first
    */
  private def topWords(input: Iterable[WordWithCount], n: Int): List[(String, Long)] =
    input
      .groupBy(_.word)
      // A strict `map` is used instead of `mapValues`, which is lazy in
      // Scala 2.12 and deprecated in 2.13.
      .map { case (word, occurrences) => word -> occurrences.map(_.count).sum }
      .toList
      .sortBy { case (_, total) => -total }
      .take(n)

  /** Window function that emits, as a single string, the three words with the
    * most occurrences in the window.
    */
  class Top3WindowFunction
      extends WindowFunction[WordWithCount, String, String, TimeWindow] {

    /** Collects the textual form of the top-3 list for one window.
      *
      * @param key    the key of the window (unused by the ranking)
      * @param window the window being evaluated (unused by the ranking)
      * @param input  all records assigned to the window
      * @param out    collector receiving exactly one string per invocation
      */
    override def apply(key: String,
                       window: TimeWindow,
                       input: Iterable[WordWithCount],
                       out: Collector[String]): Unit =
      out.collect(topWords(input, TopN).toString)
  }

  /** Main program method.
    *
    * Reads lines from a socket, splits them into words and prints, for every
    * sliding window, the three most frequent words.
    *
    * @param args command-line arguments (currently unused)
    */
  def main(args: Array[String]): Unit = {
    // the host and the port to connect to
    val hostname: String = "localhost"
    val port: Int = 9000

    // get the execution environment
    val env: StreamExecutionEnvironment =
      StreamExecutionEnvironment.getExecutionEnvironment

    // get input data by connecting to the socket
    val text: DataStream[String] = env.socketTextStream(hostname, port, '\n')

    // parse the data, group it, window it, and aggregate the counts
    val windowCounts: WindowedStream[WordWithCount, String, TimeWindow] = text
      // Split on runs of whitespace and drop empty tokens; otherwise
      // consecutive or leading blanks would be counted as the word "".
      .flatMap { line => line.split("\\s+").filter(_.nonEmpty) }
      .map { word => WordWithCount(word, 1) }
      // A constant key routes every record to the same window instance so the
      // ranking sees all words of the window.
      .keyBy(_ => "all")
      .window(
        SlidingProcessingTimeWindows.of(Time.seconds(30), Time.seconds(10)))

    val aggr = windowCounts.apply(new Top3WindowFunction())
    aggr.print().setParallelism(1)

    env.execute("Top-N-Window")
  }
}