-
Notifications
You must be signed in to change notification settings - Fork 0
/
dupe.pl
146 lines (136 loc) · 3.9 KB
/
dupe.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/perl
# reads in tfidf data for files and uses this to quickly
# find all duplicate pairs.
#
# Should only call this from makefile; "make killdupes"
#
# Output goes to two files: "dupescr", a script containing the
# appropriate dockill commands, and "dupelog", a log that gets
# appended to the global log ./DUPELOG
# Require variable declarations and enable runtime diagnostics.
use strict;
use warnings;
# STDOUT/STDERR are written as ISO-8859-1; STDIN is treated as raw bytes.
binmode STDOUT, ":encoding(iso-8859-1)";
binmode STDERR, ":encoding(iso-8859-1)";
binmode STDIN, ":bytes";
# Directory of per-document tfidf files, named "<docnum>.txt"; each line is
# read below as "<weight> <term>".
my $base='/snapshot/aimsigh/TFIDF';
# Scratch variable; used below to discard the first line of MANIFEST.
my $dummy;
# Directory containing MANIFEST, the ANCIU document list, and the
# per-document metadata files sonrai/<docnum>.dat.
my $crubadan='/usr/local/share/crubadan/ga';
my %bighash; # inverted index: term => { docnum => count }
# Progress counter; a marker is printed every 100 documents.
my $N;
my $SPECIAL=10; # number of unusual terms to index for each doc
# (these are the first $SPECIAL lines of the doc's tfidf file)
my $VERYSPECIAL=4; # how many among the most unusual terms must appear
# among the SPECIAL terms of another doc for them
# to be treated as dupe candidates
# Both output files are created (or truncated) in the current directory;
# the handles stay open for the rest of the script.
open (TORTHAI, ">", "./dupescr") or die "Could not open script file dupescr: $!\n";
open (LOGCHOMHAD, ">", "./dupelog") or die "Could not open log file dupelog: $!\n";
print "Logs opened...\n";
# Look up the URL recorded in a document's metadata file.
#
# Parameters:
#   $docnum - document identifier; the file read is
#             "$crubadan/sonrai/$docnum.dat"
# Returns the text following "url: " on the last line that starts with that
# prefix, or undef when the file contains no such line.
# Dies if the metadata file cannot be opened.
sub _read_doc_url
{
    my ($docnum) = @_;
    my $url;
    open (my $info, "<", "$crubadan/sonrai/$docnum.dat") or die "Could not open data file $docnum: $!\n";
    while (my $line = <$info>) {
        chomp $line;
        # No early exit: if several "url: " lines exist, the last one wins.
        $url = $line if ($line =~ s/^url: //);
    }
    close $info;
    return $url;
}

# Decide whether a candidate pair is a duplicate and, if so, record it.
#
# Parameters:
#   $docnum - the document being examined (the one that is kept)
#   $cand   - the candidate duplicate document
#   $cosine - similarity score computed for the pair
#
# When $cosine exceeds 0.999 this:
#   - prints the pair and score to STDOUT,
#   - appends "dockill $cand" to the kill script (TORTHAI),
#   - appends both documents' URLs, followed by a blank line, to the
#     dupe log (LOGCHOMHAD).
# Otherwise it does nothing.  No meaningful value is returned.
# Dies if either document's metadata file cannot be opened.
sub make_a_decision
{
    my ($docnum, $cand, $cosine) = @_;
    return unless ($cosine > 0.999);

    print "sim($docnum,$cand)=$cosine\n";
    print TORTHAI "dockill $cand\n";

    my @urls;
    foreach my $doc ($docnum, $cand) {
        my $url = _read_doc_url($doc);
        unless (defined $url) {
            # Keep the log's two-lines-per-pair layout, but say which
            # document lacked a URL instead of a bare "uninitialized" warning.
            warn "No url: line found in data file for document $doc\n";
            $url = '';
        }
        push @urls, $url;
    }
    print LOGCHOMHAD "$urls[0]\n$urls[1]\n\n";
    return;
}
# Pass 1: build the inverted index %bighash.
# For every document listed in MANIFEST, the terms on the first $SPECIAL
# lines of its tfidf file are recorded as  term => { docnum => count }.
print "About to open MANIFEST...\n";
open (my $manifest, "<", "$crubadan/MANIFEST") or die "Could not open MANIFEST: $!\n";
my $manifest_header = <$manifest>; # eat num. lines
print "Opened, numlines chomped, beginning to process tfidf files...\n";
while (my $entry = <$manifest>) {
    chomp $entry;
    # The progress counter counts every MANIFEST entry read.
    $N++;
    print "$N..." if ($N % 100 == 0);
    # An entry looks like "<field> <digits>.txt"; only the number is needed.
    my ($docnum) = $entry =~ m/^[^ ]+ ([0-9]+)\.txt$/;
    unless (defined $docnum) {
        # Previously a malformed entry left the file name undefined and the
        # script went on to open "$base/" itself; skip it explicitly instead.
        warn "Skipping malformed MANIFEST line $.: $entry\n";
        next;
    }
    my $doctxt = "$docnum.txt";
    open (my $source, "<", "$base/$doctxt") or die "Could not open source file $doctxt: $!\n";
    for (1..$SPECIAL) {
        my $w = <$source>;
        last unless defined($w); # in case of very short files!
        chomp $w;
        $w =~ s/^[^ ]+ //; # drop the leading weight, keep the term
        $bighash{$w}->{$docnum}++;
    }
    close $source;
}
close $manifest;
print "Done reading top-tens from tfidf files...\n";
# Pass 2: for every document in the ANCIU list, find candidate duplicates
# via the inverted index and score each candidate.
# (An earlier version re-read MANIFEST here; the list now comes from ANCIU,
# which holds one document number per line.)
#
# A document D is compared with candidate C only when
#   - each of the first $VERYSPECIAL terms of D is among the indexed
#     terms of C, and
#   - D's number is smaller than C's, so each pair is scored once.
# The score is the dot product of the two tfidf vectors
# (equal to the cosine if the stored vectors are unit length - TODO confirm).
$N=0;
print "Now rereading document list and looking for dupes...\n";
open (my $doclist, "<", "$crubadan/ANCIU") or die "Could not open doclist: $!\n";
while (my $docnum = <$doclist>) {
    chomp $docnum;
    my $doctxt = "$docnum.txt";
    $N++;
    print "$N..." if ($N % 100 == 0);
    my %tfidf; # term => weight for the current document
    my %cands; # docnum => number of leading terms shared with it
    open (my $source, "<", "$base/$doctxt") or die "Could not open source file $doctxt: $!\n";
    for (1..$VERYSPECIAL) {
        my $line = <$source>;
        last unless defined $line; # VERY short token list
        chomp $line;
        # Skip lines that are not "<weight> <term>"; using $1/$2 after a
        # failed match would silently reuse the previous line's values.
        next unless ($line =~ /^([^ ]+) (.*)$/);
        my ($weight, $w) = ($1, $2);
        $tfidf{$w} = $weight;
        # The exists test avoids autovivifying empty index entries.
        if (exists $bighash{$w}) {
            $cands{$_}++ foreach (keys %{$bighash{$w}});
        }
    }
    my @todo = grep { $cands{$_}==$VERYSPECIAL and $docnum < $_ } keys %cands;
    if (scalar @todo > 0) {
        # Only now is it worth loading the rest of this document's vector.
        while (my $line = <$source>) {
            chomp $line;
            next unless ($line =~ /^([^ ]+) (.*)$/);
            $tfidf{$2} = $1;
        }
        foreach my $cand (@todo) {
            open (my $other, "<", "$base/$cand.txt") or die "Could not open source file $cand: $!\n";
            my $cosine=0;
            while (my $line = <$other>) {
                chomp $line;
                next unless ($line =~ /^([^ ]+) (.*)$/);
                my ($val, $w) = ($1, $2);
                $cosine += $val*($tfidf{$w}) if (exists($tfidf{$w}));
            }
            close $other;
            make_a_decision($docnum, $cand, $cosine);
        }
    }
    close $source;
}
close $doclist;
# Finish the kill script with a closing "togail ga cman" command
# (presumably a rebuild step run after the dockill commands - TODO confirm),
# then close both output files.
# Buffered write errors (e.g. a full disk) are only reported by print/close,
# so both are checked rather than silently producing truncated output.
print TORTHAI "togail ga cman\n" or die "Could not write to script file dupescr: $!\n";
close(TORTHAI) or die "Could not close script file dupescr: $!\n";
close(LOGCHOMHAD) or die "Could not close log file dupelog: $!\n";
exit 0;