#!/usr/bin/env python3
#
# Copyright (C) 2016-2017 Dmitry Marakasov <[email protected]>
#
# This file is part of repology
#
# repology is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# repology is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with repology. If not, see <http://www.gnu.org/licenses/>.
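
# Usage (illustrative invocations only; the flags mirror the argparse options
# defined in Main() below and nothing else):
#
#   ./repology-linkchecker.py --unchecked --jobs 4
#   ./repology-linkchecker.py --age 7 --prefix 'https://github.com/'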
import argparse
import multiprocessing
import os
import re
import socket
import time
import urllib.parse

import requests

import repology.config
from repology.database import Database
from repology.logger import FileLogger, StderrLogger

def GetHTTPLinkStatus(url, timeout):
    try:
        response = requests.head(url, allow_redirects=True, headers={'user-agent': 'Repology link checker/0'}, timeout=timeout)

        # fallback to GET
        if response.status_code != 200:
            response = requests.get(url, allow_redirects=True, headers={'user-agent': 'Repology link checker/0'}, timeout=timeout)

        redirect = None
        size = None
        location = None

        # handle redirect chain
        if response.history:
            redirect = response.history[0].status_code

            # resolve permanent (and only permanent!) redirect chain
            for h in response.history:
                if h.status_code == 301:
                    location = urllib.parse.urljoin(h.url, h.headers.get('location'))

        # handle size
        if response.status_code == 200:
            content_length = response.headers.get('content-length')
            if content_length:
                size = int(content_length)

        return (url, response.status_code, redirect, size, location)
    except KeyboardInterrupt:
        raise
    except requests.Timeout:
        return (url, Database.linkcheck_status_timeout, None, None, None)
    except requests.TooManyRedirects:
        return (url, Database.linkcheck_status_too_many_redirects, None, None, None)
    except requests.ConnectionError:
        # check for DNS error additionally
        try:
            parsed = urllib.parse.urlparse(url)
            socket.gethostbyname(parsed.hostname)
        except socket.gaierror:
            return (url, Database.linkcheck_status_dns_error, None, None, None)
        except:
            pass
        return (url, Database.linkcheck_status_cannot_connect, None, None, None)
    except requests.exceptions.InvalidURL:
        return (url, Database.linkcheck_status_invalid_url, None, None, None)
    except:
        return (url, Database.linkcheck_status_unknown_error, None, None, None)
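
# Illustrative shape of the tuple returned above (values are made up): a page
# that permanently moved to https might yield
#   ('http://example.com/', 200, 301, 1234, 'https://example.com/')
# i.e. (url, final http status, first redirect status or None,
# content-length or None, resolved permanent redirect target or None).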

def GetLinkStatuses(urls, delay, timeout):
    results = []
    prev_host = None
    for url in urls:
        # XXX: add support for gentoo mirrors, skip for now
        if not url.startswith('http://') and not url.startswith('https://'):
            continue

        host = urllib.parse.urlparse(url).hostname

        if host and host == prev_host:
            time.sleep(delay)

        results.append(GetHTTPLinkStatus(url, timeout))

        prev_host = host

    return results
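
# Note: the sleep above is only applied between consecutive urls that share a
# hostname, so keeping urls of one host adjacent matters; Main() below extends
# each pack with the remaining urls of its last host, which keeps them together.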

def LinkProcessorWorker(queue, workerid, options, logger):
    database = Database(options.dsn, readonly=False)

    logger = logger.GetPrefixed('worker{}: '.format(workerid))

    logger.Log('Worker spawned')

    while True:
        pack = queue.get()

        if pack is None:
            logger.Log('Worker exiting')
            return

        logger.Log('Processing {} urls ({}..{})'.format(len(pack), pack[0], pack[-1]))

        for result in GetLinkStatuses(pack, delay=options.delay, timeout=options.timeout):
            url, status, redirect, size, location = result
            database.UpdateLinkStatus(url=url, status=status, redirect=redirect, size=size, location=location)

        database.Commit()

        logger.Log('Done processing {} urls ({}..{})'.format(len(pack), pack[0], pack[-1]))
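
# Worker protocol, as used by Main() below: each worker opens its own writable
# Database connection, receives packs of urls through a shared size-1
# multiprocessing.Queue, and treats a None item as the shutdown sentinel.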

def Main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dsn', default=repology.config.DSN, help='database connection params')
    parser.add_argument('--logfile', help='path to log file (log to stderr by default)')

    parser.add_argument('--timeout', type=float, default=60.0, help='timeout for link requests in seconds')
    parser.add_argument('--delay', type=float, default=3.0, help='delay between requests to one host')
    parser.add_argument('--age', type=int, default=0, help='min age for recheck in days')
    parser.add_argument('--packsize', type=int, default=128, help='pack size for link processing')
    parser.add_argument('--maxpacksize', type=int, help='max pack size for link processing (useful to skip large hosts)')
    parser.add_argument('--jobs', type=int, default=1, help='number of parallel jobs')

    parser.add_argument('--unchecked', action='store_true', help='only process unchecked (newly discovered) links')
    parser.add_argument('--checked', action='store_true', help='only process old (already checked) links')
    parser.add_argument('--failed', action='store_true', help='only process links that were checked and failed')
    parser.add_argument('--succeeded', action='store_true', help='only process links that were checked and succeeded')
    parser.add_argument('--prefix', help='only process links with specified prefix')

    options = parser.parse_args()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()
    database = Database(options.dsn, readonly=True, autocommit=True)
    queue = multiprocessing.Queue(1)
    processpool = [multiprocessing.Process(target=LinkProcessorWorker, args=(queue, i, options, logger)) for i in range(options.jobs)]
    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,  # no limit by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded
        )

        if not urls:
            logger.Log(' No more urls to process')
            break
        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into the same large pack
        # (e.g. for 'https://github.com/foo/bar' the prefix is 'https://github.com/')
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=options.age * 60 * 60 * 24,
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded
            )
        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log('Skipping {} urls ({}..{}), exceeds max pack size'.format(len(urls), urls[0], urls[-1]))
        else:
            queue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    for process in processpool:
        queue.put(None)

    for process in processpool:
        process.join()

    logger.Log('Done')

    return 0


if __name__ == '__main__':
    os.sys.exit(Main())