-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdatahog_crawler.py
105 lines (86 loc) · 2.73 KB
/
datahog_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import sys
import hashlib
import getopt
import pickle
import datetime
# Usage text shown to the user; it is printed (and the script exits) when the
# script is started without the required <root path> argument.
help_msg = '''
Usage:
python datahog_crawler.py <root path> [options]
Options:
-n --no-checksums Do not calculate checksums for files (much faster)
-o --output Specify an output file
'''
# --- Command-line handling ---------------------------------------------------
# sys.argv[1] is the directory to scan; everything after it is parsed as
# options (-n/--no-checksums, -o/--output <file>).
if len(sys.argv) < 2:
    print(help_msg)
    sys.exit()

root_path = os.path.abspath(sys.argv[1])
# Default output file is "<basename of the scanned directory>.datahog".  It is
# a relative path, so it is resolved against the current working directory.
output_path = '{}.datahog'.format(os.path.basename(root_path))
gen_checksums = True
files = []          # one metadata dict per successfully scanned file
problem_files = []  # paths whose metadata could not be read

try:
    opts, args = getopt.getopt(sys.argv[2:], 'o:n', ['output=', 'no-checksums'])
except getopt.GetoptError as err:
    print(err)
    # A bad option is a usage error: exit with a non-zero status so calling
    # scripts can detect the failure (previously this exited with 0).
    sys.exit(2)

for o, a in opts:
    if o in ('-n', '--no-checksums'):
        gen_checksums = False
    elif o in ('-o', '--output'):
        output_path = a
        # Only the parent directory has to exist; the file itself is created
        # when the results are written.
        if not os.path.isdir(os.path.dirname(os.path.abspath(output_path))):
            print('Failed: "{}" is not a valid output path.'.format(output_path))
            sys.exit(1)

# Derive this only AFTER the options are parsed.  It used to be copied from
# gen_checksums before -n was seen, so a scan run with --no-checksums was still
# saved with has_checksums=True although every checksum was None.
has_checksums = gen_checksums
# --- Crawl ---------------------------------------------------------------------
# Visit every file name reported by os.walk below root_path and record its
# path, size, ctime and (optionally) MD5 checksum in `files`.
for dirpath, dirnames, filenames in os.walk(root_path):
    for fname in filenames:
        # Joined with a literal '/' on every platform; kept unchanged so the
        # paths stored in the output keep their existing format.
        path = '{}/{}'.format(dirpath, fname)

        # One stat call supplies both values (os.path.getctime/getsize each
        # perform their own stat).  Only OSError is treated as "unreadable
        # file": a bare except would also swallow KeyboardInterrupt, making
        # the scan impossible to abort with Ctrl-C.
        try:
            info = os.stat(path)
        except OSError:
            problem_files.append(path)
            continue
        # NOTE: st_ctime is the creation time on Windows but the time of the
        # last metadata change on Unix.
        created = info.st_ctime
        size = info.st_size

        checksum = None
        if gen_checksums:
            try:
                # Hash in 1 MiB chunks so large files are never loaded into
                # memory in one piece; the digest equals that of the whole file.
                digest = hashlib.md5()
                with open(path, 'rb') as f:
                    for chunk in iter(lambda: f.read(1024 * 1024), b''):
                        digest.update(chunk)
                checksum = digest.hexdigest()
            except OSError:
                # The file is still recorded, without a checksum; the flag
                # marks the result set as not fully checksummed.
                has_checksums = False
                checksum = None

        files.append({
            'path': path,
            'checksum': checksum,
            'created': created,
            'size': size
        })
        # '\r' rewrites the same terminal line, giving a live counter.
        sys.stdout.write('\rScanned {} files'.format(len(files)))
        sys.stdout.flush()
# --- Save results --------------------------------------------------------------
# An empty result is reported as a failure and no output file is written.
if not files:
    sys.stdout.write('\rFailed: No files found in "{}".\n'.format(sys.argv[1]))
    # Non-zero status so callers can tell the scan produced nothing
    # (previously this failure path exited with 0).
    sys.exit(1)

# Container that is serialised to the output file.
obj = {
    'format': 'datahog:0.1',
    'root': root_path,
    'type': 'Local folder',
    # POSIX timestamp (float) taken when the scan results are assembled.
    'date_scanned': datetime.datetime.now().timestamp(),
    'files': files,
    'has_checksums': has_checksums
}

# NOTE: the output is a pickle; consumers should only unpickle .datahog files
# that come from a trusted source.
with open(output_path, 'wb') as outfile:
    pickle.dump(obj, outfile)
# --- Final report ----------------------------------------------------------------
# Tell the user which files could not be read: more than ten are written to a
# text file in the current working directory, ten or fewer are listed directly
# on the console.
problem_count = len(problem_files)
if problem_count > 10:
    problem_path = 'datahog_problem_files.txt'
    with open(problem_path, 'w') as report:
        # One path per line.
        report.writelines('{}\n'.format(entry) for entry in problem_files)
    print('\n\nEncountered a problem reading {} files (list saved to {})'.format(problem_count, problem_path))
elif problem_count > 0:
    print('\n\nEncountered a problem reading the following files:')
    for entry in problem_files:
        print(entry)

print('\nSaved output to {}'.format(output_path))