-
Notifications
You must be signed in to change notification settings - Fork 0
/
Geonameslist.pl
executable file
·88 lines (64 loc) · 2.33 KB
/
Geonameslist.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/perl
# Parse XML dump and create a CSV of articles with geonames set.
# Copyright (C) User:Fluff 2017
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use strict;
use warnings;
use XML::LibXML::Reader;
use DBI;
use IO::Handle;
# Main script body.
#
# Streams a MediaWiki XML dump from STDIN. For every page in namespace 0 whose
# page id is listed in the categorylinks table under a category matching
# 'Wikipedia:Artiklar_med_geonames-parameter_utan_P1566%', one tab-separated
# line is appended to the output file:
#
#     title <TAB> geonames <TAB> country <TAB> type
#
# A template parameter that is absent from the page text is written as an
# empty field. Progress and every matched title are reported on STDERR.

my $output_path      = '/data/project/perfectbot/Fluffbot/geonameslist.txt';
my $category_pattern = 'Wikipedia:Artiklar_med_geonames-parameter_utan_P1566%';

# Strings handed back by XML::LibXML are character strings, so both the log
# and the output file need an explicit encoding layer; without one Perl writes
# Latin-1 bytes for some titles and UTF-8 for others.
binmode(STDERR, ':encoding(UTF-8)');

my $dbh = DBI->connect("dbi:mysql:mysql_read_default_file=/data/project/perfectbot/.my.cnf;host=svwiki.analytics.db.svc.wikimedia.cloud;database=svwiki_p;mysql_read_timeout=3600", undef, undef, {RaiseError => 1, AutoCommit => 1});

# Only the existence of a matching row matters, so select a constant and let
# the server stop at the first hit.
my $sth = $dbh->prepare(qq!SELECT 1 FROM categorylinks WHERE cl_from = ? AND cl_to LIKE ? LIMIT 1!);

my $r = XML::LibXML::Reader->new(FD => fileno(STDIN))
    or die("Could not create XML reader on STDIN");

open(my $out, '>:encoding(UTF-8)', $output_path) or die("Could not open file: $!");
$out->autoflush(1);
#print {$out} "Title\tGeonamesid\tCountry\tType\n";

# Return the value of the template parameter $name found in $text, i.e. the
# text following "| name =" up to the next newline, '|' or '^'. Returns the
# empty string when the parameter is not present.
#
# The match is done in list context on purpose: reading $1 after a failed
# match would silently reuse the capture of an earlier, unrelated match.
sub _template_param {
    my ($text, $name) = @_;
    # NOTE(review): the character class also stops at '^'; that looks
    # accidental but is kept so existing captures do not change.
    my ($value) = $text =~ /\|\ *\Q$name\E\ *\=\ *([^\n|^\|]+)/;
    return defined $value ? $value : '';
}

my $i = 0;
PAGE:
while ($r->nextElement('page')) {
    $i++;
    warn "Scanned $i pages" if ($i % 10000 == 0);

    # The elements are read in the order title, ns, id, text; each call
    # advances the reader to the next element with that name.
    # NOTE(review): readInnerXml returns serialized markup, so characters such
    # as '&' presumably arrive as entities (&amp;) - confirm whether the
    # output should be unescaped.
    my ($title, $ns, $id, $text);
    $title = $r->readInnerXml() if $r->nextElement('title');
    $ns    = $r->readInnerXml() if $r->nextElement('ns');
    $id    = $r->readInnerXml() if $r->nextElement('id');
    $text  = $r->readInnerXml() if $r->nextElement('text');

    # Use defined/length rather than plain truthiness so that a page whose
    # title is literally "0" is not dropped.
    next PAGE unless defined $title && length $title;
    next PAGE unless defined $text  && length $text;
    next PAGE unless $id;

    # Only namespace 0 is wanted; a missing or non-numeric <ns> is skipped
    # instead of being compared numerically (undef == 0 would be true).
    next PAGE unless defined $ns && $ns =~ /\A\s*-?\d+\s*\z/ && $ns == 0;

    # Fetch instead of relying on $sth->rows(), which DBI does not guarantee
    # to be meaningful for SELECT statements before the rows are fetched.
    $sth->execute($id, $category_pattern);
    my $in_category = defined $sth->fetchrow_arrayref();
    $sth->finish();
    next PAGE unless $in_category;

    my $type     = _template_param($text, '1');
    my $geonames = _template_param($text, 'geonames');
    my $country  = _template_param($text, 'country');

    # Drop trailing blanks from the id and wiki-link brackets from the country.
    $geonames =~ s/(\ |\t)+$//g;
    $country  =~ s/\[\[//g;
    $country  =~ s/\]\]//g;

    warn "$title";
    print {$out} "$title\t$geonames\t$country\t$type\n";
}

# Buffered write errors only surface at close, so check it.
close($out) or die("Could not close file: $!");
$dbh->disconnect();