-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrate_inspector.py
executable file
·460 lines (364 loc) · 14.5 KB
/
crate_inspector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
#!/usr/bin/python3 -u
import argparse
from collections import defaultdict
import csv
import hashlib
import io
import json
import re
import os
import semver
import subprocess
import sys
import tarfile
import tempfile
import time
import urllib.request
csv.field_size_limit(1024 * 1024)
def try_int(x):
try:
return int(x)
except:
return x
def crates_csv_extract(row):
""" return a tuple (downloads, id, name, repository) """
return (try_int(row[3]), try_int(row[5]), row[7], row[9])
def versions_csv_extract(row):
""" return a tuple (crate_id, version) """
return (try_int(row[0]), row[7])
SEMVER_PLACEHOLDER = semver.VersionInfo.parse('0.0.0')
def try_semver(s):
""" try to parse a semver string """
try:
return semver.VersionInfo.parse(s)
except:
# The only thing we need semver structs for is to get the sort
# order correct; for strings that don't parse as semver we'll just
# return a placeholder value.
# Returning the original string here almost works, except that when
# we do the version sort later semver will try again to parse the
# string and raise an exception.
return SEMVER_PLACEHOLDER
def latest_version(vlist):
""" Sort a list of (semver, string) version tuples.
The first element is used to get the sort order correct, while the second
is returned once sorting is complete.
"""
vlist = sorted(vlist, reverse=True)
return vlist[0][1]
def download_crate(name, version):
url = f'https://static.crates.io/crates/{name}/{name}-{version}.crate'
try:
response = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
if e.code == 403:
print(f'HTTP 403 (Forbidden) error.')
return None
raise
return response.read()
class CrateTarball:
def __init__(self, data):
filelike = io.BytesIO(data)
self.tarball = tarfile.open(fileobj=filelike, mode='r:gz')
def extract_crate_meta(self):
""" Extract the cargo_vcs_info data from the crates.io tarball. """
for member in self.tarball.getmembers():
if member.name.endswith('.cargo_vcs_info.json'):
json_data = self.tarball.extractfile(member).read()
return json.loads(json_data)
print('ERROR: no .cargo_vcs_info file in crate tarball')
def examine_files(self, filename_pattern):
""" Provide an iterator over files that match a particular pattern.
The filename_pattern is in python regex format.
Returns (filename, file_reader)
"""
for member in self.tarball.getmembers():
if re.match(filename_pattern, member.name):
#print(f'*** examine file {member.name}')
yield (member.name, self.tarball.extractfile(member))
class CratesDbDump:
""" This extracts a crate/version listing from the crates.io database dump. """
def __init__(self, filename):
tarball = tarfile.open(filename)
for member in tarball.getmembers():
if member.name.endswith('crates.csv'):
tar_reader = tarball.extractfile(member)
text_reader = io.TextIOWrapper(tar_reader)
self.load_crates(text_reader)
elif member.name.endswith('versions.csv'):
tar_reader = tarball.extractfile(member)
text_reader = io.TextIOWrapper(tar_reader)
self.load_versions(text_reader)
assert self.crates
assert self.versions
def load_crates(self, reader):
data = csv.reader(reader)
# crates.csv:
# 0 1 2 3 4 5 6 7 8 9 10
# created_at,description,documentation,downloads,homepage,id,max_upload_size,name,readme,repository,updated_at
header = crates_csv_extract(data.__next__())
assert header == ('downloads', 'id', 'name', 'repository')
crates = []
for row in data:
crates.append(crates_csv_extract(row))
# Do a reverse-sort so that the most popular crates come first.
crates.sort(reverse=True)
self.crates = crates
def load_versions(self, reader):
data = csv.reader(reader)
# versions.csv:
# 0 1 2 3 4 5 6 7 8 9 10
# crate_id,crate_size,created_at,downloads,features,id,license,num,published_by,updated_at,yanked
header = versions_csv_extract(data.__next__())
assert header == ('crate_id', 'num')
versions = defaultdict(list)
for row in data:
crate_id, version_string = versions_csv_extract(row)
vers = try_semver(version_string)
versions[crate_id].append((vers, version_string))
self.versions = versions
class Git:
def __init__(self):
self.repo_dir = tempfile.TemporaryDirectory()
self.env = my_env = os.environ.copy()
self.env['GIT_TERMINAL_PROMPT'] = "0"
def run(self, *args, **kwargs):
try:
kwargs['cwd'] = self.repo_dir.name
kwargs['stdin'] = subprocess.DEVNULL
kwargs['env'] = self.env
if 'check' not in kwargs:
kwargs['check'] = True
return subprocess.run(*args, **kwargs)
except subprocess.CalledProcessError as e:
print(f'git error: {e}')
raise
@staticmethod
def hash_blob(data):
""" Return one or more git-style hashes for a blob of data.
Because git may handle line-endings different ways, we return
one or two hashes; first we hash the file as-is, then if the file
contains any `\r\n` patterns we hash it again with those converted
to `\n`.
Returns a list of sha1 hashes (each hash as a hex string).
"""
def hashit(data):
stuff = b'blob ' + str(len(data)).encode() + b'\0'
return hashlib.sha1(stuff + data).hexdigest()
hashes = [hashit(data)]
if b'\r\n' in data:
data2 = data.replace(b'\r\n', b'\n')
hashes.append(hashit(data2))
return hashes
def blob_exists(self, blob_hash):
""" Returns True if the blob exists in this repo. """
result = self.run(['git', 'cat-file', '-e', blob_hash], check=False)
return result.returncode == 0
@staticmethod
def fixup_url(url):
""" Try to derive a git-clone-able URL from the specified URL.
Some projects put a github URL in their crate metadata that only
works in a web browser. We can derive a git-compatible url by
stripping off the '/tree/branchname/etc' suffix.
"""
# Note that '\w+(?:-\w+)+' captures a word that can contain hypens.
m = re.match(r'(https://github.com/[\w\-_]+/[\w\-_]+)/tree/', url)
if m:
return m.groups()[0]
else:
return url
def clone_full(self, url):
""" Clone a repo. """
url = self.fixup_url(url)
self.run(['git', 'clone', '-q', '--bare', url, '.'])
def clone_shallow(self, url, commit_hash):
""" Shallow-clone a repo, then find the tags associated with a hash. """
url = self.fixup_url(url)
self.run(['git', 'init', '-q', '.'])
self.run(['git', 'remote', 'add', 'origin', url])
self.run(['git', 'fetch', '-q', '-t',
'--depth=1', 'origin', commit_hash])
def get_tags(self, commit_hash):
""" Find the tags associated with a commit.
Returns a list of strings (one for each tag).
"""
result = self.run(['git', 'tag', '--points-at',
commit_hash], capture_output=True)
tags = result.stdout.split()
# Parse bytes to str.
tags = [t.decode() for t in tags]
return tags
class Verifier:
""" Do all the verification steps on a crate. """
def __init__(self, crate_name, crate_version, repo_url):
self.repo = None
self.commit_hash = None
self.crate_name = crate_name
self.crate_version = crate_version
self.repo_url = repo_url
def print(self, msg):
print(f'{self.crate_name} {self.crate_version} {msg}')
def check_url(self):
if self.repo_url:
self.print(f'repo url is {self.repo_url}')
return True
else:
self.print(f'ERROR: invalid repo url: "{self.repo_url}"')
return False
def download(self):
""" Try to download the crate source from crates.io .
We don't write it to a file, but instead examine it in-memory.
Returns True on success; False on a permissions error.
May raise exceptions on other errors (e.g. http/connectivity problems).
"""
try:
tarball_data = download_crate(self.crate_name, self.crate_version)
if not tarball_data:
self.print(f'ERROR: permanent crate download failure')
return False
except Exception as e:
self.print(f'ERROR: failed to download crate: {e}')
# If crates.io is unreachable, then we should just stop the script.
raise
try:
self.tarball = CrateTarball(tarball_data)
except Exception:
self.print(f'ERROR: failed to extract crate tarball')
raise
return True
def match_tags(self):
""" Examine a list of tags to see if one matches this version.
The tag formats we expect are:
- 1.2.3
- v1.2.3
- cratename-1.2.3
- cratename-v1.2.3
returns True if a tag match is found, False otherwise.
"""
try:
tags = self.repo.get_tags(self.commit_hash)
except Exception as e:
self.print(f'ERROR: failed to read tags: {e}')
def try_match(tag):
if tag in tags:
self.print(f'commit-hash matches tag "{tag}"')
return True
return False
nn = self.crate_name
vv = self.crate_version
success = try_match(self.crate_version) or try_match(
f'v{vv}') or try_match(f'{nn}-{vv}') or try_match(f'{nn}-v{vv}')
if not success:
self.print(f'ERROR: tag match fail: {tags}')
return success
def clone_shallow(self):
""" Try to extract the crates.io scm hash, and then shallow-clone the repo. """
try:
meta = self.tarball.extract_crate_meta()
self.commit_hash = meta['git']['sha1']
except Exception:
self.print(f'ERROR: failed to extract crate metadata')
raise
self.print(f'scm hash {self.commit_hash}')
try:
repo = Git()
repo.clone_shallow(self.repo_url, self.commit_hash)
self.repo = repo
except Exception as e:
self.print(f'ERROR: failed to shallow-clone repo: {e}')
raise
def clone_full(self):
try:
self.print(f'doing full repo clone...')
repo = Git()
repo.clone_full(self.repo_url)
self.repo = repo
except Exception as e:
self.print(f'ERROR: failed to clone repo: {e}')
raise
def search_blobs(self):
""" Returns True if all files exist in the repo. """
file_count = 0
files_unmatched = 0
for filename, file_reader in self.tarball.examine_files('.*\.rs$'):
blob_hashes = self.repo.hash_blob(file_reader.read())
blob_exists = False
for blob_hash in blob_hashes:
if self.repo.blob_exists(blob_hash):
blob_exists = True
break
file_count += 1
if not blob_exists:
self.print(f'ERROR: file not in repo: {filename}')
files_unmatched += 1
self.print(
f'files checked: {file_count} unmatched files: {files_unmatched}')
if files_unmatched:
self.print(f'ERROR: {files_unmatched} files not found in repo')
return files_unmatched == 0
# Download from https://static.crates.io/db-dump.tar.gz
DBDUMPFILE_DEFAULT = 'db-dump.tar.gz'
def parse_range(val):
""" Parse a number or a numeric range, e.g. '7' or '0-10'. """
vals = val.split(sep='-', maxsplit=1)
if len(vals) == 1:
x = int(vals[0])
return (x, x + 1)
if len(vals) == 2:
return (int(vals[0]), int(vals[1]))
def do_verify(db, crate_db_row):
crate_id = crate_db_row[1]
crate_name = crate_db_row[2]
vers = db.versions[crate_id]
latest = latest_version(vers)
url = crate_db_row[3]
verifier = Verifier(crate_name, latest, url)
verifier.print(f'has {crate_db_row[0]} downloads')
if not verifier.check_url():
# Without a valid repo URL, there's nothing else we can do.
return
if not verifier.download():
# If crates.io returns a permissions error,
# there's nothing more we can do.
return
try:
verifier.clone_shallow()
verifier.match_tags()
if not verifier.search_blobs():
# Hacky way of forcing the clone_full to execute.
raise Exception
except Exception:
print('shallow match failed.')
try:
verifier.clone_full()
verifier.search_blobs()
except Exception:
# Clone failed, nothing more we can do.
return
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--dbdumpfile', default=DBDUMPFILE_DEFAULT)
parser.add_argument('--rank', type=parse_range, default=(0, 100))
parser.add_argument('--crate', help='crate name to inspect')
args = parser.parse_args()
db = CratesDbDump(args.dbdumpfile)
print(f'There are {len(db.crates)} crates in this database dump.')
if args.crate:
# Search for the crate by name
for crate_db_row in db.crates:
crate_name = crate_db_row[2]
if crate_name == args.crate:
do_verify(db, crate_db_row)
return
print(f'failed to find a crate named "{args.crate}"')
sys.exit(1)
# Examine crates by their ranking (by number of downloads)
rank_start, rank_end = args.rank
for index, crate_db_row in enumerate(db.crates[rank_start:rank_end], start=rank_start):
if index > rank_start:
# Time delay between steps, to honor the crates.io crawling policy.
time.sleep(2)
print(f'ranking: {index}')
do_verify(db, crate_db_row)
if __name__ == '__main__':
main()