Skip to content

Commit

Permalink
Aggiunto lo split del lexicon
Browse files Browse the repository at this point in the history
  • Loading branch information
GiacomoManzoli committed Dec 23, 2016
1 parent 7db84e6 commit 3dd8acf
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 106 deletions.
7 changes: 5 additions & 2 deletions experiment_1.properties
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
experiment.name = e10kOLD_FASHION
experiment.name = e10kNEW_FASHION_2

experiment.pipeline = clustering, save_graph_data, save_history, merge_history, YASS_stemming

# Lexicon
experiment.lexicon_path = ./lexicon/italian/AGZ1994.txt
# opzionali, ma devono essere presenti entrambi
#experiment.lexicon_range.start = 52600
#experiment.lexicon_range.end = 53000
experiment.lexicon_range.start = 50000
experiment.lexicon_range.end = 50500
experiment.lexicon_range.end = 60000

# Riduzione dei dati
experiment.stopwords = ./lexicon/italian/stoplist/stoplist2.txt
experiment.discard_numbers = true
experiment.allow_split = true

experiment.distance = d3
experiment.thresholds = 0.2,0.4,0.6
Expand Down
15 changes: 11 additions & 4 deletions src/bm/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ private static void saveMergeHistory(String expName, String distanceName, List<M
new FileOutputStream(filePath), "utf-8"))) {
// f.write("%s,%s,%s,%s\n" % (str(m.c1), str(m.c2), str(m.dist), str(m.cnt)))
for (MergeHistoryRecord m : mergeHistory){
writer.write(m.getC1()+ "," + m.getC2()+","+m.getDist()+","+m.getCnt()+"\n");
writer.write(m.getC1()+ "," + m.getC2()+","+m.getCres() +","+m.getDist()+","+m.getCnt()+"\n");
}
} catch (Exception e){
System.err.println(e.getMessage());
Expand All @@ -99,8 +99,9 @@ private static List<MergeHistoryRecord> loadMergeHistory(String expName, String
history.add(new MergeHistoryRecord(
Integer.parseInt(parts[0]),
Integer.parseInt(parts[1]),
Float.parseFloat(parts[2]),
Integer.parseInt(parts[3])
Integer.parseInt(parts[2]),
Float.parseFloat(parts[3]),
Integer.parseInt(parts[4])
));
}
in.close();
Expand Down Expand Up @@ -216,7 +217,13 @@ public static void main(String[] args) {
List<MergeHistoryRecord> mergeHistory = new ArrayList<>();
if (exp.getPipeline().contains(PIPE_CLUSTERING)) {
System.out.println("Eseguo l'algoritmo di clustering con la misura "+exp.getDistanceMeasure().getName());
mergeHistory = HierarchicalClustering.calculateClusters(exp.getDistanceMeasure(), lexicon);

if (exp.isSplitAllowed()){
mergeHistory = HierarchicalClustering.calculateClustersSplitting(exp.getDistanceMeasure(), lexicon);
} else {
mergeHistory = HierarchicalClustering.calculateClustersSplitting(exp.getDistanceMeasure(), lexicon);
}

System.out.println("Completato clustering! Tempo trascorso: " + (System.currentTimeMillis() - startTime)/1000);

if (exp.getPipeline().contains(PIPE_SAVE_GRAPH_DATA)) {
Expand Down
199 changes: 107 additions & 92 deletions src/bm/clustering/HierarchicalClustering.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,16 @@ public static List<MergeHistoryRecord> calculateClusters(DistanceMeasure d, List
}
// r < s
newClusters.add(Cluster.merge(nextId, manager.getCluster(r), manager.getCluster(s)));
mergedCluster.add(r);
mergedCluster.add(s);
nextId++;

historyRecords.add(new MergeHistoryRecord(
manager.getCluster(r).getId(),
manager.getCluster(s).getId(),
nextId,
pair.getDist(),
manager.size() - newClusters.size())
);
mergedCluster.add(r);
mergedCluster.add(s);
nextId++;

}

Expand Down Expand Up @@ -98,7 +98,7 @@ public static List<MergeHistoryRecord> calculateClusters(DistanceMeasure d, List
}

@SuppressWarnings("Duplicates")
public List<MergeHistoryRecord> calculateClustersSplitting(DistanceMeasure d, List<String> words) {
public static List<MergeHistoryRecord> calculateClustersSplitting(DistanceMeasure d, List<String> words) {
// Divido per lettera iniziale e faccio il clustering per ogni sottoinsieme.

final int TOTAL_NUMER_OF_STARTING_CLUTERS = words.size();
Expand All @@ -121,90 +121,46 @@ public List<MergeHistoryRecord> calculateClustersSplitting(DistanceMeasure d, Li
* */


for (int wordIndex = 0; wordIndex < words.size(); wordIndex++) {
int startIndex= 0;
for (int startIndex = 0; startIndex < words.size(); ) {
//int startIndex= 0;
Character currentChar = words.get(startIndex).charAt(0);
int q = 1;
while (words.get(startIndex + q).charAt(0) == currentChar)
while (startIndex + q < words.size() && words.get(startIndex + q).charAt(0) == currentChar)
q++;
int endIndex = startIndex +q; // indice della prima parola che inizia con una lettera diversa.
System.out.println("Clustering per le parole da "+ words.get(startIndex) + " a " + words.get(endIndex-1));
System.out.println("Clustering per le parole da "+ words.get(startIndex) + " a " + words.get(endIndex-1) + " ("+ (endIndex-startIndex )+")");
try {
System.out.println("Il prossimo partirà da: " + words.get(endIndex));
} catch (Exception e) {
System.out.println("Non ci sono ulteriori sub-set");
}
try {
@SuppressWarnings("unchecked")
List<String> wordsSubset = (ArrayList<String>) ((ArrayList<String>) words.subList(startIndex, endIndex)).clone();

int n = wordsSubset.size();
int printInterval = (int)Math.max(100, n*0.00005);

// Creo i cluster
List<Cluster> clusters = new ArrayList<>();
for (int i = nextIntialIndex; i < wordsSubset.size(); i++){
List<String> clusterWords = new ArrayList<>();
clusterWords.add(wordsSubset.get(i));
clusters.add(new Cluster(i, clusterWords));
}
nextIntialIndex = nextIntialIndex + wordsSubset.size();
ClusterManager manager = new ClusterManager(clusters, d);


int cntIter = 0;
long startTime = System.currentTimeMillis();
while (manager.size() != 1){
List<MinDistancePair> minDistancePairs = manager.findMinDistancePairs();
Set<Integer> mergedCluster = new HashSet<>();
List<Cluster> newClusters = new ArrayList<>();

for (MinDistancePair pair: minDistancePairs) {
int r = pair.getR();
int s = pair.getS();
if (mergedCluster.contains(r) || mergedCluster.contains(s)){
continue; // Se uno dei due indici è già stato mergiato, salto la coppia
}
newClusters.add(Cluster.merge(nextId, manager.getCluster(r), manager.getCluster(s)));
mergedCluster.add(r);
mergedCluster.add(s);
nextId++;

historyRecords.add(new MergeHistoryRecord(
manager.getCluster(r).getId(),
manager.getCluster(s).getId(),
pair.getDist(),
manager.size() - newClusters.size())
);

}
List<Integer> toDelete = new ArrayList<>();
toDelete.addAll(mergedCluster);
manager.deleteClusters(toDelete);
//
// IMPORTANTE: prima di effettuare l'inserimento devono essere stati eliminati i vecchi cluster, altrimenti
// non c'è posto all'interno della matrice delle distanze del manager.
//
for (Cluster c : newClusters) {
manager.insert(c);
}
//
// IMPORTANTE: questa operazione deve essere fatta DOPO tutte le cancellazioni e insierimenti
//
manager.resize();
cntIter++;
if (cntIter % (printInterval) == 0) {
System.out.println("Iterazione: " + cntIter + " numero di cluster presenti: "+ manager.size() +
" - Tempo trascorso: "+ (System.currentTimeMillis() - startTime)/1000 + " s");
}
}
System.out.println("Iterazioni necessarie: "+cntIter);


List<String> wordsSubset = words.subList(startIndex, endIndex);

} catch (ClassCastException e){
System.err.println(e.toString());
// Creo i cluster
List<Cluster> clusters = new ArrayList<>();
for (int i = 0; i < wordsSubset.size(); i++){
List<String> clusterWords = new ArrayList<>();
clusterWords.add(wordsSubset.get(i));
clusters.add(new Cluster(nextIntialIndex + i, clusterWords));
}
nextIntialIndex = nextIntialIndex + wordsSubset.size();
int lastCreatedId = nextId-1;
List<MergeHistoryRecord> newRecords = HierarchicalClustering.clusterer(d, clusters, nextId);
historyRecords.addAll(newRecords);
nextId += newRecords.size();
if (lastCreatedId != TOTAL_NUMER_OF_STARTING_CLUTERS*2 -2 ) { // alla prima iterazione non devo inserire il bridge
int c2;
if (newRecords.size() > 0){
c2 = newRecords.get(newRecords.size() -1).getCres(); // id dell'utlimo cluster creato
} else {
c2 = nextIntialIndex - 1; // è l'unico cluster con quella lettera iniziale
}
MergeHistoryRecord bridgeRecord = new MergeHistoryRecord(lastCreatedId, c2, nextId, Float.POSITIVE_INFINITY, 1);
nextId++;
historyRecords.add(bridgeRecord);
}
startIndex = endIndex;
}
// historyRecords contiene tutti i record con i cluster, ma devo correggere gli indici.
// Come prima cosa ordino i record per distanza di merge.
Expand All @@ -215,26 +171,85 @@ public List<MergeHistoryRecord> calculateClustersSplitting(DistanceMeasure d, Li
Map<Integer, Integer> idMapping = new HashMap<>(); // <oldId, newId>
for (int j = 0; j < historyRecords.size(); j++){
MergeHistoryRecord current = historyRecords.get(j);
// Aggiunto l'indice c1.
if (idMapping.containsKey(current.getC1())){
current.setC1(idMapping.get(current.getC1()));
} else {
assert current.getC1() >= TOTAL_NUMER_OF_STARTING_CLUTERS*2 -1;
idMapping.put(current.getC1(), nextCorrectId);
current.setC1(nextCorrectId);
nextCorrectId++;
current.setCnt(TOTAL_NUMER_OF_STARTING_CLUTERS - j -1);
idMapping.put(current.getCres(), nextCorrectId);
current.setCres(nextCorrectId);
nextCorrectId++;

// Aggiusto l'indice c1.
if (current.getC1() >= TOTAL_NUMER_OF_STARTING_CLUTERS) {
if (idMapping.containsKey(current.getC1())){
current.setC1(idMapping.get(current.getC1()));
} else {
assert false;
}
}
// Aggiusto l'indice c2.
if (idMapping.containsKey(current.getC2())){
current.setC1(idMapping.get(current.getC2()));
} else {
assert current.getC2() >= TOTAL_NUMER_OF_STARTING_CLUTERS*2 -1;
idMapping.put(current.getC2(), nextCorrectId);
current.setC2(nextCorrectId);
nextCorrectId++;
if (current.getC2() >= TOTAL_NUMER_OF_STARTING_CLUTERS) {
if (idMapping.containsKey(current.getC2())) {
current.setC2(idMapping.get(current.getC2()));
} else {
assert false;
}
}
}
assert nextCorrectId == TOTAL_NUMER_OF_STARTING_CLUTERS*2 -1;
return historyRecords;
}

/**
 * Runs the agglomerative merge loop on the given clusters until the manager holds a single
 * cluster.
 *
 * <p>On every iteration the manager is asked for its current minimum-distance pairs. Each pair
 * whose two indices were not already merged during the same iteration is merged into a new
 * cluster whose id is the current {@code nextId}, and one history record is appended per merge.
 *
 * @param d        distance measure handed to the {@code ClusterManager}
 * @param clusters starting clusters
 * @param nextId   id given to the first merged cluster; incremented by one after every merge
 * @return the merge records in the order the merges were performed; empty when the manager
 *         holds fewer than two clusters
 */
@SuppressWarnings("Duplicates")
private static List<MergeHistoryRecord> clusterer(DistanceMeasure d, List<Cluster> clusters, int nextId) {
    int printInterval = (int) Math.max(10, clusters.size() * 0.00005);
    ClusterManager manager = new ClusterManager(clusters, d);
    int cntIter = 0;
    long startTime = System.currentTimeMillis();
    List<MergeHistoryRecord> historyRecords = new ArrayList<>();

    // "> 1" instead of "!= 1": a manager holding zero clusters must not loop forever.
    while (manager.size() > 1) {
        List<MinDistancePair> minDistancePairs = manager.findMinDistancePairs();
        Set<Integer> mergedCluster = new HashSet<>();
        List<Cluster> newClusters = new ArrayList<>();

        for (MinDistancePair pair : minDistancePairs) {
            int r = pair.getR();
            int s = pair.getS();
            if (mergedCluster.contains(r) || mergedCluster.contains(s)) {
                continue; // one of the two indices was already merged in this iteration: skip the pair
            }
            newClusters.add(Cluster.merge(nextId, manager.getCluster(r), manager.getCluster(s)));
            // The record is created after the merged cluster is queued, so the remaining-cluster
            // count passed as last argument already accounts for this merge.
            historyRecords.add(new MergeHistoryRecord(
                    manager.getCluster(r).getId(),
                    manager.getCluster(s).getId(),
                    nextId,
                    pair.getDist(),
                    manager.size() - newClusters.size())
            );
            mergedCluster.add(r);
            mergedCluster.add(s);
            nextId++;
        }

        List<Integer> toDelete = new ArrayList<>(mergedCluster);
        manager.deleteClusters(toDelete);
        //
        // IMPORTANT: the old clusters must be deleted before inserting the new ones, otherwise
        // there is no room left in the manager's distance matrix.
        //
        for (Cluster c : newClusters) {
            manager.insert(c);
        }
        //
        // IMPORTANT: this operation must run AFTER all deletions and insertions.
        //
        manager.resize();
        cntIter++;
        if (cntIter % (printInterval) == 0) {
            System.out.println("Iterazione: " + cntIter + " numero di cluster presenti: " + manager.size() +
                    " - Tempo trascorso: " + (System.currentTimeMillis() - startTime) / 1000 + " s");
        }
    }
    System.out.println("Iterazioni necessarie: " + cntIter);
    return historyRecords;
}
}

16 changes: 11 additions & 5 deletions src/bm/clustering/HistoryClusterBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,17 @@ public static List<ClusterSet> buildSetsFromHistory(List<String> words, List<Mer

Cluster cluster1 = clusterSet.getCluster(record.getC1());
Cluster cluster2 = clusterSet.getCluster(record.getC2());
Cluster merged = Cluster.merge(nextId, cluster1, cluster2);
nextId++;
clusterSet.removeCluster(record.getC1());
clusterSet.removeCluster(record.getC2());
clusterSet.addCluster(merged);
try {
Cluster merged = Cluster.merge(nextId, cluster1, cluster2);
nextId++;
clusterSet.removeCluster(record.getC1());
clusterSet.removeCluster(record.getC2());
clusterSet.addCluster(merged);
} catch (Exception e){
System.err.println(e.toString());
System.exit(123);
}

}

return snapshots;
Expand Down
22 changes: 19 additions & 3 deletions src/bm/clustering/MergeHistoryRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ public class MergeHistoryRecord implements Comparable{

private int c1;
private int c2;
private int cres;
private float dist;
private int cnt;

Expand All @@ -17,9 +18,10 @@ public class MergeHistoryRecord implements Comparable{
* @param dist distanza dei due cluster.
* @param cnt numero di cluster rimanenti dopo l'operazione di merge.
* */
public MergeHistoryRecord(int c1, int c2, float dist, int cnt) {
public MergeHistoryRecord(int c1, int c2, int cres, float dist, int cnt) {
this.c1 = c1;
this.c2 = c2;
this.cres = cres;
this.dist = dist;
this.cnt = cnt;
}
Expand All @@ -45,6 +47,7 @@ public String toString() {
return "MergeHistoryRecord{" +
"c1=" + c1 +
", c2=" + c2 +
", cres=" + cres +
", dist=" + dist +
", cnt=" + cnt +
'}';
Expand All @@ -59,6 +62,7 @@ public boolean equals(Object o) {

if (c1 != that.c1) return false;
if (c2 != that.c2) return false;
if (cres != that.cres) return false;
return Float.compare(that.dist, dist) == 0 && cnt == that.cnt;
}

Expand All @@ -70,10 +74,10 @@ public int compareTo(Object o) {
0 if this == that
a positive int if this > that
* */
if (this.equals(record)){
if (this.dist == record.dist){
return 0;
}
if (this.dist <= record.dist) {
if (this.dist < record.dist) {
return -1;
} else {
return 1;
Expand All @@ -86,4 +90,16 @@ public void setC1(Integer c1) {
/**
 * Replaces the id stored as the second merged cluster ({@code c2}).
 *
 * @param c2 new id; it is unboxed into the {@code int} field, so it must not be {@code null}
 */
public void setC2(Integer c2) {
this.c2 = c2;
}

/**
 * Replaces the stored count of clusters remaining after this merge.
 *
 * @param cnt new remaining-cluster count
 */
public void setCnt(int cnt) {
this.cnt = cnt;
}

/**
 * Returns the value stored in {@code cres}, the id of the cluster produced by merging
 * {@code c1} and {@code c2}.
 *
 * @return id of the resulting cluster
 */
public int getCres() {
return cres;
}

/**
 * Replaces the stored id of the cluster produced by this merge.
 *
 * @param cres new id of the resulting cluster
 */
public void setCres(int cres) {
this.cres = cres;
}
}
Loading

0 comments on commit 3dd8acf

Please sign in to comment.