-
Notifications
You must be signed in to change notification settings - Fork 0
/
pr.pl
123 lines (113 loc) · 3.42 KB
/
pr.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/perl
use strict;
use warnings;
# PDL is deliberately not used: the matrices are too big for memory, so the
# link structure is kept in plain Perl arrays and hashes instead.

# Brin-Page damping constant.
my $q = 0.15;
# Root directory under which all the files read by this script live.
my $base = '/usr/local/share/crubadan/ga';

# URL => index in the big matrix, for every URL to be ranked.  Needed to
# create the "linktome" matrix when hrefs are found in docs.
my %byurl = ();
# index in the big matrix => crubadan docId (100000..999999)
my @docIds = ();
# index in the big matrix => number of links out of that page
my @linksout = ();
# the "sinks" (docs with no links out), each stored as an index in the
# big matrix
my @sinks = ();
# array of arrays: $linktome[$i] holds the indices of the docs that link
# to the doc at index $i
my @linktome = ();
# Load the list of documents to rank from $base/MANIFEST.
#
# Lines of interest look like "<url> <docId>.txt"; anything that does not
# start with "http" (e.g. the count on the first line) is ignored.  Each
# accepted line gets the next free index in the big matrix, so %byurl and
# @docIds always agree with each other even if the file contains extra or
# malformed lines.
print "Reading MANIFEST for list of URLs...\n";
open(my $manifest, '<', "$base/MANIFEST")
    or die "Could not open manifest file $base/MANIFEST: $!\n";
while (my $line = <$manifest>) {
    next unless $line =~ /^http/;
    chomp $line;
    my ($url, $docId) = $line =~ m/^([^ ]+) ([0-9]+)\.txt$/;
    unless (defined $docId) {
        # Do not let a bad line create an undef URL/docId entry.
        warn "Skipping malformed MANIFEST line $.: $line\n";
        next;
    }
    # The index is the position the docId is about to take in @docIds.
    $byurl{$url} = scalar @docIds;
    push @docIds, $docId;
}
close $manifest;

my $N = scalar @docIds;
# Everything below divides by $N; fail here with a clear message instead of
# a bare "Illegal division by zero" later on.
die "No URLs found in $base/MANIFEST; nothing to rank\n" if $N == 0;
my $end = $N - 1;

# One (initially empty) list of incoming links per document.
push @linktome, map { [] } 0 .. $end;
# Uniform starting rank; will be overwritten below for every document whose
# metadata carries a pagerank value.
my @pr = (1/$N) x $N;
my $prtotal  = 0;   # sum of the pagerank values read from the metadata
my $htmldocs = 0;   # number of HTML documents seen
my $all      = 0;   # number of links between ranked documents
# For every document, read its metadata from sonrai/<docId>.dat (url, format
# and current pagerank).  For HTML documents, additionally run get_refs.pl
# over the cached copy in taisce/ and record every link that points at
# another ranked document in @linktome / @linksout.  Documents that end up
# with no such links are recorded in @sinks.
print "Reading cached files, extracting links and computing incidence matrix...\n";
for my $j (0..$end) {
    open(my $docdata, '<', "$base/sonrai/$docIds[$j].dat")
        or die "Could not open file $docIds[$j].dat: $!\n";
    my $extension;
    my $url;
    while (my $line = <$docdata>) {
        chomp $line;
        if ($line =~ m/^url: (.*)\z/s) {
            $url = $1;
            # unless ($byurl{$url} == $j) {die}
        }
        elsif ($line =~ m/^format: (.*)\z/s) {
            $extension = $1;
        }
        elsif ($line =~ m/^pagerank: (.*)\z/s) {
            $pr[$j] = $1;
            # print "j=$j; setting initial pr for $docIds[$j] to $pr[$j]\n";
            $prtotal += $1;
        }
    }
    close $docdata;

    $linksout[$j] = 0;
    # A .dat file without a "format:" line leaves $extension undefined.
    if (defined $extension && $extension eq 'html') {
        my $realfile = "$base/taisce/$docIds[$j].$extension";
        $htmldocs++;
        print "$htmldocs... " if ($htmldocs % 100 == 0);
        # note: get_refs returns only uniq refs
        # also a good thing: get_refs normalizes all URLs
        # to the form stored by crubadan, even if the explicit
        # href in a doc uses something that redirects, etc.
        #
        # The URL comes from crawled data, so it must never be pasted into
        # a shell command line.  Fork, attach the cached file to the child's
        # STDIN and exec get_refs.pl with an argument list (no shell).
        my $pid = open(my $refs_fh, '-|');
        die "Could not fork to run get_refs.pl: $!\n" unless defined $pid;
        if ($pid == 0) {
            # Child: its STDOUT is already the pipe back to the parent.
            open(STDIN, '<', $realfile)
                or die "Could not open $realfile: $!\n";
            exec('/usr/local/bin/get_refs.pl', defined $url ? $url : '')
                or die "Could not exec get_refs.pl: $!\n";
        }
        while (my $ref = <$refs_fh>) {
            chomp $ref;
            # Only links to other ranked documents count.
            if (exists($byurl{$ref})) {
                push @{$linktome[$byurl{$ref}]}, $j;
                $linksout[$j]++;
                $all++;
            }
        }
        # The exit status of get_refs.pl is ignored, as it was with the
        # original backtick pipeline.
        close $refs_fh;
    }
    push(@sinks, $j) if ($linksout[$j] == 0);
}
print "Processed $N Irish documents ($htmldocs in HTML); found $all Irish-to-Irish links...\n";
# Documents sometimes get 'dockill'ed, which subtracts their probabilities
# from the total mass.  Spread whatever is missing ("slosh") evenly over all
# documents so that the ranks sum to 1 again.
my $slosh = (1 - $prtotal) / $N;
foreach my $idx (0 .. $end) {
    $pr[$idx] += $slosh;
}
print "Added $slosh to each to ensure a probability measure...\n";
# Now compute page rank: one iteration, reading @pr and filling @newpr.
print "Beginning one iteration of page rank calculation...\n";

# Two of the three contributions to a document's new rank are the same for
# every document: the damping share q/N of every document's rank, and the
# (1-q)/N share of every sink's rank.  Sum them once instead of once per
# row; that turns an O(N^2) loop into O(N + number of links).  The terms are
# accumulated in the same order as before, so the results are unchanged.
my $shared_mass = 0;
for my $j (0..$end) {
    $shared_mass += $pr[$j]*$q/$N;
}
for my $j (@sinks) {
    $shared_mass += $pr[$j]*(1-$q)/$N;
}

my @newpr;
for my $i (0..$end) {
    print "row $i/$end\n" if ($i % 100 == 0);
    $newpr[$i] = $shared_mass;
    # Each doc linking here passes on (1-q) of its rank, split evenly
    # over its outgoing links.
    for my $j (@{$linktome[$i]}) {
        $newpr[$i] += $pr[$j]*(1-$q)/$linksout[$j];
    }
}
# Write ./scr, a shell script with one "sed -i" command per document that
# replaces the pagerank line of its sonrai/<docId>.dat file with the newly
# computed value (plus an "echo" every 100 documents as a progress marker).
# The script is only written here, not run.
open(my $scr, '>', './scr') or die "Could not open output script: $!\n";
for my $i (0..$end) {
    print {$scr} "echo $i\n" if ($i % 100 == 0);
    print {$scr} "sed -i '/^pagerank/s/.*/pagerank: $newpr[$i]/' $base/sonrai/$docIds[$i].dat\n";
}
# Buffered write errors (e.g. a full disk) only surface at close; a silently
# truncated script would update just part of the documents.
close($scr) or die "Could not finish writing output script: $!\n";
exit 0;