-
Notifications
You must be signed in to change notification settings - Fork 0
/
DBSCAN.java
187 lines (173 loc) · 5.24 KB
/
DBSCAN.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.TreeSet;
/**
 * Density-based clustering (DBSCAN-style) of documents.
 *
 * <p>Each document is turned into a binary vector with one component per
 * reference word (1 when the document's word queue contains the word, 0
 * otherwise). Documents are classified as core, border or noise from the
 * Euclidean distance between those vectors, and are then grouped into
 * {@code Cluster} objects by {@link #execute()}.
 *
 * <p>NOTE(review): a point is core when at least {@code minPts} <em>other</em>
 * points lie within {@code epsilon}; the point itself is not counted. Confirm
 * this matches the intended definition of {@code minPts}.
 */
public class DBSCAN {
    /** Point classifications stored in {@link #type}. */
    private static final int CORE = 1;
    private static final int BORDER = 2;
    private static final int NOISE = 3;

    /** Cluster kinds used while growing a cluster from a seed. */
    private static final int DENSE_CLUSTER = 1; // accepts core and border points
    private static final int NOISE_CLUSTER = 2; // accepts noise points only

    private final ArrayList<Cluster> clusters;
    private final String[] reference;
    private final Collection<DocumentObject> docs;
    private final int minPts;
    private final double epsilon;
    private int[] type; // CORE, BORDER or NOISE per document index (0 = not classified yet)

    /**
     * Creates a clusterer; no work is done until {@link #execute()} is called.
     *
     * @param minPts   minimum number of other points within {@code epsilon}
     *                 for a point to be classified as core
     * @param epsilon  neighbourhood radius (Euclidean distance between word vectors)
     * @param wordsSet reference vocabulary; its iteration order fixes the vector dimensions
     * @param docs     documents to cluster
     */
    public DBSCAN(int minPts, double epsilon, Set<String> wordsSet, Collection<DocumentObject> docs) {
        this.clusters = new ArrayList<Cluster>();
        this.reference = wordsSet.toArray(new String[wordsSet.size()]);
        this.docs = docs;
        this.minPts = minPts;
        this.epsilon = epsilon;
        this.type = new int[docs.size()];
    }

    /**
     * Classifies every document and splits the documents into clusters,
     * printing progress messages to standard output.
     *
     * <p>New clusters are appended to the list returned by
     * {@link #getClusters()}; the list is never cleared, so calling this
     * method again appends a second set of clusters.
     */
    public void execute() {
        DocumentObject[] docArray = this.docs.toArray(new DocumentObject[this.docs.size()]);
        int total = docArray.length;
        // Re-size here so the classification array always matches the documents
        // actually being processed, even if the collection changed since construction.
        this.type = new int[total];

        System.out.println("Getting initial point types");
        // Build every word vector once instead of once per compared pair.
        // Assumes getWords() returns the same words for the duration of this call.
        ArrayList<ArrayList<Double>> points = new ArrayList<ArrayList<Double>>(total);
        for (int i = 0; i < total; i++) {
            points.add(getPoint(docArray[i].getWords()));
        }
        ArrayList<Integer> undecided = classifyByDensity(points);

        System.out.println("Addressing possible border/noise points");
        resolveUndecided(points, undecided);

        System.out.println("Splitting into clusters...");
        boolean[] visited = new boolean[total];
        int seed = 0;
        while (true) {
            // The lowest unvisited index never decreases, so a forward-only cursor suffices.
            while (seed < total && visited[seed]) {
                seed++;
            }
            if (seed >= total) {
                break;
            }
            this.clusters.add(new Cluster(this.reference.length));
            growCluster(docArray, points, visited, seed);
        }
        System.out.println("Done.");
    }

    /**
     * First pass: marks points with at least {@code minPts} neighbours as core
     * and points with no neighbours as noise.
     *
     * @return indices of points that have some neighbours but fewer than
     *         {@code minPts}; they are either border or noise
     */
    private ArrayList<Integer> classifyByDensity(ArrayList<ArrayList<Double>> points) {
        ArrayList<Integer> undecided = new ArrayList<Integer>();
        for (int i = 0; i < points.size(); i++) {
            ArrayList<Double> origin = points.get(i);
            int neighbours = 0;
            for (int j = 0; j < points.size(); j++) {
                if (j != i && calculateDist(origin, points.get(j), true) <= this.epsilon) {
                    neighbours++;
                }
            }
            if (neighbours >= this.minPts) {
                this.type[i] = CORE;
            } else if (neighbours == 0) {
                this.type[i] = NOISE;
            } else {
                undecided.add(i);
            }
        }
        return undecided;
    }

    /**
     * Second pass: an undecided point becomes border when any core point lies
     * within {@code epsilon} of it, otherwise noise.
     */
    private void resolveUndecided(ArrayList<ArrayList<Double>> points, ArrayList<Integer> undecided) {
        for (int k = 0; k < undecided.size(); k++) {
            int p = undecided.get(k);
            ArrayList<Double> origin = points.get(p);
            boolean nearCore = false;
            for (int j = 0; j < points.size(); j++) {
                // Skip the point itself: compare against the document index p,
                // not the position k inside the undecided list.
                if (j != p && this.type[j] == CORE
                        && calculateDist(origin, points.get(j), true) <= this.epsilon) {
                    nearCore = true;
                    break;
                }
            }
            this.type[p] = nearCore ? BORDER : NOISE;
        }
    }

    /**
     * Adds {@code seed} and every point reachable from it to the most recently
     * created cluster. A noise seed collects only noise points; any other seed
     * collects only core and border points.
     *
     * <p>The walk is depth-first in ascending neighbour order, driven by an
     * explicit stack so that large clusters cannot overflow the call stack.
     *
     * <p>NOTE(review): border points are expanded as well as core points, so
     * two dense regions joined only through border points end up in the same
     * cluster. Textbook DBSCAN expands core points only; confirm this is intended.
     */
    private void growCluster(DocumentObject[] docArray, ArrayList<ArrayList<Double>> points,
                             boolean[] visited, int seed) {
        Cluster cluster = this.clusters.get(this.clusters.size() - 1);
        int mode = (this.type[seed] == NOISE) ? NOISE_CLUSTER : DENSE_CLUSTER;

        // Each stack entry holds the not-yet-examined neighbours of one point on the current path.
        LinkedList<Queue<Integer>> pending = new LinkedList<Queue<Integer>>();
        cluster.add(convert(points.get(seed)), docArray[seed].getID());
        visited[seed] = true;
        pending.push(compatibleNeighbours(points, visited, seed, mode));

        while (!pending.isEmpty()) {
            Integer candidate = pending.peek().poll();
            if (candidate == null) {
                pending.pop(); // this point's neighbours are exhausted; backtrack
                continue;
            }
            if (visited[candidate]) {
                continue; // reached through another path in the meantime
            }
            cluster.add(convert(points.get(candidate)), docArray[candidate].getID());
            visited[candidate] = true;
            pending.push(compatibleNeighbours(points, visited, candidate, mode));
        }
    }

    /**
     * Returns, in ascending index order, the unvisited points within
     * {@code epsilon} of {@code center} whose classification is allowed in a
     * cluster of the given {@code mode}.
     */
    private Queue<Integer> compatibleNeighbours(ArrayList<ArrayList<Double>> points,
                                                boolean[] visited, int center, int mode) {
        Queue<Integer> found = new LinkedList<Integer>();
        ArrayList<Double> origin = points.get(center);
        for (int j = 0; j < points.size(); j++) {
            if (j == center || visited[j]) {
                continue;
            }
            int kind = this.type[j];
            boolean allowed = (mode == NOISE_CLUSTER && kind == NOISE)
                    || (mode == DENSE_CLUSTER && (kind == CORE || kind == BORDER));
            if (allowed && calculateDist(origin, points.get(j), true) <= this.epsilon) {
                found.add(j);
            }
        }
        return found;
    }

    /**
     * Builds the binary word vector of a document: component {@code i} is 1.0
     * when {@code documentWords} contains reference word {@code i}, else 0.0.
     */
    private ArrayList<Double> getPoint(Queue<String> documentWords) {
        ArrayList<Double> point = new ArrayList<Double>(this.reference.length);
        for (int i = 0; i < this.reference.length; i++) {
            point.add(documentWords.contains(this.reference[i]) ? 1.0 : 0.0);
        }
        return point;
    }

    /**
     * Distance between two vectors of equal length.
     *
     * @param euclidian {@code true} for Euclidean distance, {@code false} for
     *                  Manhattan (sum of absolute differences)
     */
    private static double calculateDist(ArrayList<Double> point, ArrayList<Double> point2, boolean euclidian) {
        // Accumulate in double: an int accumulator would silently truncate
        // fractional terms and could overflow.
        double sum = 0.0;
        for (int i = 0; i < point.size(); i++) {
            double difference = point.get(i) - point2.get(i);
            sum += euclidian ? difference * difference : Math.abs(difference);
        }
        return euclidian ? Math.sqrt(sum) : sum;
    }

    /** Returns the live list of clusters produced so far by {@link #execute()}. */
    public ArrayList<Cluster> getClusters() {
        return this.clusters;
    }

    /** Converts a vector of doubles to integers by truncating each component. */
    private static ArrayList<Integer> convert(ArrayList<Double> point) {
        ArrayList<Integer> converted = new ArrayList<Integer>(point.size());
        for (int i = 0; i < point.size(); i++) {
            double value = point.get(i);
            converted.add((int) value);
        }
        return converted;
    }
}