# -*- coding: utf-8 -*-
# The MIT License (MIT)
# Copyright (c) 2019 [email protected]
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
from __future__ import print_function # make Python 2's print behave like Python 3's (for the colored output)
__author__ = 'Lim Kok Hole'
__copyright__ = 'Copyright 2019'
__credits__ = [
'Dr. Neal Krawetz at http://www.hackerfactor.com/about.php', #research of average/difference hash
'David Oftedal at http://01101001.net/', #average/difference hash code (without comparison metric)
'Sastanin at https://stackoverflow.com/users/25450/sastanin' #Manhattan/zero norm code (without resize and comparison metric)
]
__license__ = 'MIT'
__version__ = 1.1
__cache_file_version__ = 1.1 # only increase this if the cache file structure changes
__maintainer__ = 'Lim Kok Hole'
__email__ = '[email protected]'
__status__ = 'Production'
import sys
import os
import time
import uuid
import traceback
from PIL import Image
# Python 2 and 3: backward-compatible
from past.builtins import xrange
# Not going to use this since 3.5's walk() already defaults to scandir
# https://github.com/benhoyt/scandir
#try: # python 3
# from os import scandir
#except ImportError:
# from scandir import scandir
try:
import readline # allow edit input by LEFT key
except ImportError:
print("Module readline not available.")
else:
# import rlcompleter
readline.set_completer_delims(' \t\n=')
readline.parse_and_bind("tab: complete")
import argparse
from argparse import RawTextHelpFormatter # makes --help's \n work and keeps indentation pretty
arg_parser = argparse.ArgumentParser(
# don't specify prefix_chars='-+/' here, which would cause / in a path value to be treated as an option prefix
description='Find similar images in a specific directory by a specific image', formatter_class=RawTextHelpFormatter)
args = ""
PY3 = sys.version_info[0] >= 3
if PY3:
pass
else:
input = raw_input
OS_SEP = os.sep
RESIZE_X = 0 #256
RESIZE_Y = 0 #128
def quit(msgs, exit=True):
if not isinstance(msgs, list):
msgs = [msgs]
for msg in msgs:
print('\x1b[1;41m%s\x1b[0m\x1b[K' % msg)
if exit:
print('\x1b[1;41m%s\x1b[0m\x1b[K' % 'Abort.')
sys.exit()
from PIL import ImageStat
#Research by http://www.hackerfactor.com/blog/?/archives/432-Looks-Like-It.html
#Hash code by http://01101001.net/DifferenceHash.py and http://01101001.net/AverageHash.py
#hole: but I decided to combine both and pick the higher of them as the "matched" metric.
def AverageHash(theImage):
# Convert the image to 8-bit grayscale.
theImage = theImage.convert("L") # 8-bit grayscale
# Squeeze it down to an 8x8 image.
theImage = theImage.resize((8,8), Image.ANTIALIAS)
# Calculate the average value.
averageValue = ImageStat.Stat(theImage).mean[0]
# Go through the image pixel by pixel.
# Return 1-bits when the tone is equal to or above the average,
# and 0-bits when it's below the average.
averageHash = 0
for row in xrange(8):
for col in xrange(8):
averageHash <<= 1
averageHash |= 1 * ( theImage.getpixel((col, row)) >= averageValue)
return averageHash
def DifferenceHash(theImage):
# Convert the image to 8-bit grayscale.
theImage = theImage.convert("L") # 8-bit grayscale #orig
# [todo:0] some files warning:
# /home/xiaobai/.local/lib/python3.6/site-packages/PIL/Image.py:993: UserWarning
# : Palette images with Transparency expressed in bytes should be converted to RGBA images
#theImage = theImage.convert("1") # 8-bit grayscale # hole
# Squeeze it down to an 8x8 image.
theImage = theImage.resize((8, 8), Image.ANTIALIAS) #orig
#theImage = theImage.resize((64, 64), Image.ANTIALIAS) # hole
# Go through the image pixel by pixel.
# Return 1-bits when a pixel is equal to or brighter than the previous
# pixel, and 0-bits when it's below.
# Use the 64th pixel as the 0th pixel.
previousPixel = theImage.getpixel((0, 7))
differenceHash = 0
for row in xrange(0, 8, 2):
# Go left to right on odd rows.
for col in xrange(8):
differenceHash <<= 1
pixel = theImage.getpixel((col, row))
differenceHash |= 1 * (pixel >= previousPixel)
previousPixel = pixel
row += 1
# Go right to left on even rows.
for col in xrange(7, -1, -1):
differenceHash <<= 1
pixel = theImage.getpixel((col, row))
differenceHash |= 1 * (pixel >= previousPixel)
previousPixel = pixel
return differenceHash
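# Illustrative sketch, not part of the script's flow: share_print() below compares two of these 64-bit
# hashes by XOR-ing them and counting the differing bits, e.g. with two hypothetical hash values:
#   h1 = 0x8f3c2a1d5e6b7091
#   h2 = 0x8f3c2a1d5e6b7093
#   differing_bits = bin(h1 ^ h2).count("1")            # here only 1 bit differs
#   similarity = (64 - differing_bits) * 100.0 / 64.0   # ~98.44%, so "Matched" at the default 80% threshold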
def share_print(img_f_hash_diff, img_f_hash_avg, dir_img_f_hash_diff, dir_img_f_hash_avg, path, ei, total_matched):
if args.verbose:
print('dir_img_f_hash_diff: ' + str(dir_img_f_hash_diff))
print('dir_img_f_hash_avg: ' + str(dir_img_f_hash_avg))
print('img_f_hash_diff: ' + str(img_f_hash_diff))
print('img_f_hash_avg: ' + str(img_f_hash_avg))
d_diff = ((64 - bin(img_f_hash_diff ^ dir_img_f_hash_diff).count("1"))*100.0)/64.0 #orig
d_avg = ((64 - bin(img_f_hash_avg ^ dir_img_f_hash_avg).count("1"))*100.0)/64.0 #orig
#d = ((4096 - bin(img_f_hash ^ dir_img_f_hash).count("1"))*100.0)/4096.0 # hole
#d_both = ((d_diff+d_avg)/2.0) #no longer print the averaged value since it's misleading
# ... because the current comparison metric is "pick the higher of both"
if args.verbose:
print('img percentage diff: ' + str(d_diff))
print('img percentage avg: ' + str(d_avg))
print('arg percentage: ' + str(args.percentage))
# no need to update the cache file to test this, since the cache stores the original hashes, not the combined hash
#if d_both >= args.percentage:
if (d_diff >= args.percentage) or (d_avg >= args.percentage):
if d_diff >= args.percentage: #note that this doesn't mean avg didn't match, only that diff matched first
print('[' + str(ei) + '] \x1b[1;35m' + path + '\x1b[0m\x1b[K \x1b[1;32m[Matched]\x1b[0m\x1b[K diff: ' + str(d_diff) + '%' + ' avg: ' + str(d_avg) + '%' )
else:
print('[' + str(ei) + '] \x1b[1;35m' + path + ' \x1b[0m\x1b[K diff: ' + str(d_diff) + '%' + ' \x1b[1;32m[Matched]\x1b[0m\x1b[K avg: ' + str(d_avg) + '%' )
# total_matched not incremented yet, so +1
if args.show is not None:
if (args.show == 0) or ((total_matched+1) <= args.show):
try:
Image.open(path).show()
except OSError:
print('\x1b[1;41m-s failed. Probably file no longer exists\x1b[0m\x1b[K: ' + path)
return True
else:
if not args.m:
print('[' + str(ei) + '] \x1b[1;35m' + path + '\x1b[0m\x1b[K \x1b[1;31m[Not Match]\x1b[0m\x1b[K diff: ' + str(d_diff) + '%' + ' avg: ' + str(d_avg) + '%' )
return False
def diff_file(path, hash_cache_f, write_once, dir_path, hash_type):
dir_img_f = Image.open(path)
dir_img_f.load()
dir_img_f_hash_diff = DifferenceHash(dir_img_f)
dir_img_f_hash_avg = AverageHash(dir_img_f)
if args.w is not False:
if not write_once:
dir_full_path = os.path.abspath(os.path.expanduser(os.path.expandvars(dir_path)))
path_total_line = dir_full_path.count('\n') + 1
hash_cache_f.writelines( str(__cache_file_version__) + '\n')
hash_cache_f.writelines( str(path_total_line) + '\n')
hash_cache_f.writelines(dir_full_path + '\n') #store directory path on 1st line
write_once = True
path_total_line = path.count('\n') + 1
hash_cache_f.writelines( str(path_total_line) + '\n')
hash_cache_f.writelines(path + '\n')
# no need to worry about str() truncating; it's an int, not a float
hash_cache_f.writelines(str(dir_img_f_hash_diff) + '\n')
hash_cache_f.writelines(str(dir_img_f_hash_avg) + '\n')
if args.verbose:
print('hash diff: ' + str(dir_img_f_hash_diff))
print('hash avg: ' + str(dir_img_f_hash_avg))
# [todo:0] need .close() or not? which is faster?
# dir_img_f.close()
return (dir_img_f_hash_diff, dir_img_f_hash_avg, write_once)
#shared by args.ln and args.lnm
def create_symlink(symlink_dir_path, path):
if symlink_dir_path:
try:
# path is already a full path via abspath(), and the symlink target needs a full path
if args.unique_ln:
os.symlink( path, os.path.join(symlink_dir_path, str(uuid.uuid4()) + '_' + os.path.basename(path)) )
else:
os.symlink( path, os.path.join(symlink_dir_path, os.path.basename(path)) )
except OSError:
print('Failed to create symlink: ' + path)
if args.verbose:
print('This error is normal if the file already exists.')
def to_grayscale(arr):
"If arr is a color image (3D array), convert it to grayscale (2D array)."
if len(arr.shape) == 3:
return average(arr, -1) # average over the last axis (color channels)
else:
return arr
def normalize(arr):
#[todo:1/2] move on top same ?
amin = arr.min()
rng = arr.max() - amin #arr.min()
# https://gist.github.com/astanin/626356#gistcomment-2568216
# prevent divide by 0 error
# hole: you can generate a test image with `convert -size 128x256 xc:white empty128x256_white.jpg` to reproduce this:
#, all pixel values are the same, so max() - min() produces an rng of 0 here
#, which makes every value NaN and impossible to match by the float metric, even though no exception is raised
if rng == 0:
rng = 1
#amin = arr.min()
return (arr - amin) * 255 / rng
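# Minimal illustration with assumed values (not executed): an array with pixel values 10..60 is stretched
# to the full 0..255 range, e.g. [10, 35, 60] -> [0.0, 127.5, 255.0]; a flat array like [50, 50, 50]
# hits the rng == 0 guard above and simply becomes [0.0, 0.0, 0.0].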
def compare_images(img_f_np_ndarray, dir_img_f_np_ndarray):
# calculate the difference and its norms
diff = img_f_np_ndarray - dir_img_f_np_ndarray # elementwise for scipy arrays
m_norm = sum( abs(diff) ) # Manhattan norm #hole: seems like impossible -, so abs() useless here
#z_norm = norm( diff.ravel(), 0 ) # Zero norm
#return (m_norm, z_norm)
return m_norm
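# Illustrative sketch with assumed toy arrays (not executed): for two normalized 2x2 grayscale arrays
#   a = [[  0, 255], [128,  64]]
#   b = [[  0, 250], [120,  64]]
# the elementwise diff is [[0, 5], [8, 0]], so the Manhattan norm sum(abs(diff)) is 13
# (13 / 4 = 3.25 per pixel); this norm is later compared against acceptable_manhattan_diff.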
if __name__ == "__main__":
arg_parser.add_argument('-d', '--dir', dest='dir_to_scan', help='Directory path to scan recursively.\
\nYou can\'t use -d together with -c')
arg_parser.add_argument('-md', '--max-depth', dest='max_depth', type=int, help='Specify max depth of directory path to scan.\
\nIf you use -md together with -c then its max depth depends on the path of cache file without re-scan.')
arg_parser.add_argument('-v', '--verbose', action='store_true', help='Shows more logging.\
\nYou probably want to remove -m too.')
arg_parser.add_argument('-n', '--norm', dest='norm', action='store_true'
, help='Use Manhattan norm, which does not allow caching.\
\nIt has higher tolerance for different sizes but lower tolerance for brightness.\
\nDefault is Average and Difference hash (pick the higher), which allows caching with -c/-w.')
arg_parser.add_argument('-m', action='store_true',
help='Only show matched lines.')
arg_parser.add_argument('-c', nargs='?', default=False, help='Read hash values from this cache file without re-scanning images. Default is hash.cache if -c is given without a value.\
\nYou should use -w once to generate a new cache file before using -c')
arg_parser.add_argument('-w', nargs='?', default=False,
help='Write hash values to this cache file for re-use by -c. Default is hash.cache if -w is given without a value.')
arg_parser.add_argument('-s', '--show', type=int, help='Show the first N matched images in a popup viewer.\
\n0 means unlimited, but be aware it may hang your system if too many viewers pop up.')
arg_parser.add_argument('-l', '--link-match', dest='ln', help='Create symlinks of matched images in this directory.')
arg_parser.add_argument('-lnm', '--link-not-match', dest='lnm', help='Create symlinks of not-matched images in this directory.')
arg_parser.add_argument('-f', '--follow-symlink', dest='follow_symlink', action='store_true'
, help='Follow symlinks for files and directories.\
\nDefault is to not follow symlinks, to avoid scanning duplicated files (-s would pop up twice) in the -l/-lnm directory.')
arg_parser.add_argument('-u', '--unique-ln', dest='unique_ln', action='store_true'
, help='Create symlinks with unique filenames via uuid.\
\nThis option does nothing without -l or -lnm')
arg_parser.add_argument('-p', '--percentage', default=80.0, type=float
, help='Match when the similarity is greater than or equal to this percentage floating value (without %%).\
\nDefault is 80.0%%. You should try lowering it to 75%% or 70%% if a few expected matches are missed.')
arg_parser.add_argument('image_path', nargs='?', help='Specify a single image path to check.')
arg_parser.add_argument('--version', action='version', version='%(prog)s ' + str(__version__))
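# Example invocations (illustrative only, with placeholder paths):
#   python similar_img.py target.jpg -d ~/Pictures -p 75 -m   # scan a directory, print matched lines only
#   python similar_img.py target.jpg -d ~/Pictures -w         # scan once and also write hash.cache for re-use
#   python similar_img.py target.jpg -c -s 3                  # re-use hash.cache, pop up the first 3 matches
#   python similar_img.py target.jpg -d ~/Pictures -n         # use the Manhattan norm instead of the hashes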
args, remaining = arg_parser.parse_known_args() #--version will stop here
if remaining:
quit('Redundant arguments: ' + ' '.join(remaining))
if args.dir_to_scan and (args.c is not False):
quit('You can\'t use -d together with -c')
# Need to handle both -c without a value (parsed as None) and -c with a value (a string);
if (args.c is not False) and (args.w is not False):
# only the absence of -c parses to False, hence the "is not False" checks here
# -c without value -> None
# no -c -> False
quit('You can\'t use -c together with -w')
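# For reference (hypothetical command lines): omitting -c leaves args.c as False; a bare `-c` parses to
# None; `-c my.cache` parses to the string 'my.cache'. The same three-way distinction applies to -w.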
if args.norm and ( (args.c is not False) or (args.w is not False) ):
quit('You can\'t use -n together with -w/-c')
if args.image_path:
img_path = args.image_path
else:
if PY3:
img_path = input('Image path/name: ')
else:
img_path = input('Image path/name: '.encode('utf-8'))
if not img_path: # don't strip()
quit('Image path should not be empty. Abort.')
img_path = os.path.abspath(os.path.expanduser(os.path.expandvars(img_path)))
if args.c is False:
if args.dir_to_scan:
dir_path = args.dir_to_scan
else:
if PY3:
dir_path = input('Directory path/name ( "." for current directory): ')
else:
dir_path = input('Directory path/name ( "." for current directory): '.encode('utf-8'))
if not dir_path: # don't strip()
quit('Directory path should not be empty. Abort.')
# [1] abspath already act as normpath to remove trailing os.sep
# [2] expanduser expands ~
# [3] expandvars expands $HOME
dir_path = os.path.abspath(os.path.expanduser(os.path.expandvars(dir_path)))
if not os.path.exists(dir_path):
quit('Directory ' + dir_path + ' does not exist.')
elif not os.path.isdir(dir_path):
quit(dir_path + ' is not a directory.')
if args.norm:
# https://stackoverflow.com/a/3935002/1074998
# Modified (added resize and comparison metric) from https://gist.github.com/astanin/626356
# only import these if the norm algorithm is used
import cv2
#from scipy.misc import imread
from scipy.linalg import norm
from scipy import sum, average
#import numpy as np #test cache .save/.load
# [todo:0] support alpha with non-alpha
# test case: vegito_black.jpg and vegito_by_al3x796-dakqk7k.png
img_f_np_ndarray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
#img_f_np_ndarray = cv2.imread(img_path, cv2.IMREAD_COLOR )
#img_f_np_ndarray = cv2.imread(img_path, cv2.IMREAD_REDUCED_GRAYSCALE_8)
try:
RESIZE_Y, RESIZE_X = img_f_np_ndarray.shape[:2] #(height, width)
except AttributeError:
quit('Probably the image path does not exist:\n' + img_path)
if args.percentage > 100:
arg_percentage = 100
elif args.percentage < 0:
arg_percentage = 0
else:
arg_percentage = args.percentage
# [rule 1] To make the Manhattan comparison work, each directory image must be resized to the same shape
# as the source image, so the source image itself needs no resizing.
# [rule 2] 100% means a perfect match for the percentage metric, while 0.0 means a perfect match for the
# Manhattan norm, so we need 100 - arg_percentage to "reverse" the metric.
# [rule 3] 255 is the maximum per-pixel value of array_1_max(all 255) - array_2_min(all 0), and the total
# depends on the resized shape, so we multiply them by the percentage
# ... to get the final acceptable norm
acceptable_manhattan_diff = RESIZE_Y * RESIZE_X * 255 * (100 - arg_percentage) / 100.0
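# Worked example (assumed source image of 256x128 pixels at the default 80%):
#   acceptable_manhattan_diff = 128 * 256 * 255 * (100 - 80) / 100.0 = 1671168.0
# i.e. the summed per-pixel difference may be at most 20% of the theoretical maximum 128*256*255.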
#hole: no need to check zero norm, AFAIS it's the same
#acceptable_zero_diff = RESIZE_Y * RESIZE_X * (100 - arg_percentage) / 100.0
print('RESIZE_X: ' + str(RESIZE_X) + ' RESIZE_Y: ' + str(RESIZE_Y))
print('Acceptable percentage is: ' + str(arg_percentage))
print('Acceptable manhattan norm is: ' + str(acceptable_manhattan_diff))
#print('Acceptable zero norm is: ' + str(acceptable_zero_diff))
#height, width, channels = img_f_np_ndarray.shape
#hole: in case got channel, see https://stackoverflow.com/questions/19098104
#try:
# img_f_np_ndarray = cv2.resize(img_f_np_ndarray, (RESIZE_X, RESIZE_Y)) # hole
#except cv2.error:
# quit('Probably the image path not exists:\n' + img_path)
img_f_np_ndarray = to_grayscale(img_f_np_ndarray.astype(float))
# normalize to compensate for exposure difference
# print('before norm: ' + repr(img_f_np_ndarray))
img_f_np_ndarray = normalize(img_f_np_ndarray)
# print('after norm: ' + repr(img_f_np_ndarray))
else:
try:
img_f = Image.open(img_path)
img_f.load() # careful: this does not use the return result
img_f_hash_avg = AverageHash(img_f)
img_f_hash_diff = DifferenceHash(img_f)
if args.verbose:
print('hash diff: ' + str(img_f_hash_diff) + ' hash avg: ' + str(img_f_hash_avg))
except OSError: # Python 2 has no FileNotFoundError, so catch the ancestor OSError
quit('No such image file exists. Abort.')
hash_cache = 'hash.cache'
if args.w is not False:
if args.w:
hash_cache = args.w
if args.verbose:
print('w filename: ' + hash_cache)
open(hash_cache, "w").close() # empty the file
hash_cache_f = open(hash_cache, "a")
elif args.c is not False:
if args.c:
hash_cache = args.c
if args.verbose:
print('c filename: ' + hash_cache)
try:
hash_cache_f = open(hash_cache, "r")
except OSError:
quit('-c\'s file ' + hash_cache +
' doesn\'t exist. You should use -w once to generate a cache file before using -c')
else:
hash_cache_f = None
ln_dir_path = None
if args.ln:
ln_dir_path = os.path.abspath(os.path.expanduser(os.path.expandvars(args.ln)))
if not os.path.exists(ln_dir_path):
try:
os.makedirs(ln_dir_path)
except OSError: # Python 3's FileExistsError descends from OSError, which exists in Python 2 too
if args.verbose:
print('Symlink directory already exists.')
elif not os.path.isdir(ln_dir_path):
quit('The -l ' + ln_dir_path + ' path already exists but is not a directory.')
elif os.path.exists(ln_dir_path):
cont = input('\n-l ' + ln_dir_path + ' directory already exists,\ncontinue to write to this directory? [yes/no]: ')
if not cont or cont[0].lower() != 'y':
quit('')
lnm_dir_path = None
if args.lnm:
lnm_dir_path = os.path.abspath(os.path.expanduser(os.path.expandvars(args.lnm)))
if not os.path.exists(lnm_dir_path):
try:
os.makedirs(lnm_dir_path)
except OSError: # Python 3's FileExistsError descends from OSError, which exists in Python 2 too
if args.verbose:
print('Symlink directory already exists.')
elif not os.path.isdir(lnm_dir_path):
quit('The -lnm ' + lnm_dir_path + ' path already exists but is not a directory.')
elif os.path.exists(lnm_dir_path):
cont = input('\n-lnm ' + lnm_dir_path + ' directory already exists,\ncontinue to write to this directory? [yes/no]: ')
if not cont or cont[0].lower() != 'y':
quit('')
total_matched = 0
total_not_match = 0
total_not_a_image = 0
max_depth = args.max_depth
if args.c is False or args.norm:
if args.verbose:
print('read from files')
ei = 0
# for ei, entry in enumerate(scandir(dir_path)): #scandir
write_once = False
d_path_len = len(dir_path)
#high_accuracy = args.percentage >= 100
# os.walk() already defaults to followlinks=False for directories
for subdir, dirs, files in os.walk(dir_path, topdown=True, followlinks=args.follow_symlink): #[toprove:0] will topdown slower ?
# --max-depth 0 should do nothing, conforms to find command
if (max_depth is None) or ( subdir[d_path_len:].count(OS_SEP) < max_depth ):
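# Depth illustration (hypothetical paths): with dir_path=/data/pics, a file under /data/pics/holiday/2019
# has subdir[d_path_len:] == '/holiday/2019', i.e. 2 occurrences of OS_SEP, so it is only visited when --max-depth is at least 3.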
for file in files:
ei+=1
path = OS_SEP.join([subdir, file])
# if file.is_file(): #scandir
# Since this script's main feature is to store matches as symlinks, it may walk into the -l directory and cause duplicate -s popups
# Default should not follow symlinks
if not args.follow_symlink and os.path.islink(path):
if args.verbose:
print('Skipping symlink file: ' + path)
continue
if args.verbose:
print('Trying... [' + str(ei) + ']' + path)
if args.norm:
#img_f_np_ndarray
dir_img_f_np_ndarray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
try:
dir_img_f_np_ndarray = cv2.resize(dir_img_f_np_ndarray, (RESIZE_X, RESIZE_Y)) # hole
except cv2.error:
# [1] 592631bd925880533f5e26993ef9eb4247b02a34r1-540-303_hq.gif with anim throws here
# [2] invalid image aHR0cDovL2ltZzQuaW1ndG4uYmRpbWcuY29tL2l0L3U9MzQwNzA2Mjk2LDIwMjg4MDgxNzYmZm09MjYmZ3A9MC5qcGc=.jpg
# ... also throws here
# Both of the above still produce some value with the avg/diff hash method.
if args.verbose:
print('\x1b[0;35mProbably not an image.\x1b[0m\x1b[K')
total_not_a_image += 1
continue
dir_img_f_np_ndarray = to_grayscale(dir_img_f_np_ndarray.astype(float))
# print('before norm dir: ' + repr(dir_img_f_np_ndarray))
dir_img_f_np_ndarray = normalize(dir_img_f_np_ndarray)
# print('after norm dir: ' + repr(dir_img_f_np_ndarray))
#AFAIS img_f_np_ndarray.size doesn't change after return from normalize() above
#n_m, n_0 = compare_images(img_f_np_ndarray, dir_img_f_np_ndarray)
n_m = compare_images(img_f_np_ndarray, dir_img_f_np_ndarray)
#n_0 < 15958 #goku_black_ssj_rose_palette_xenoverse_by_al3x796-dag4hgd.png #15800)
#careful if text "by pixel" nid to take into account of 0 which 100% match
#10648, 9600
#1000000
#9000, 16357
#if ( ( high_accuracy and ( ((n_m < 900000) and (n_0 < 7000)) or ( (n_m < 760000) and (n_0 < 8000) ) or ( (n_m < 660000) and (n_0 < 9000) ) ) ) or (not high_accuracy and (n_m < 1173136)) ): #1100000:
#if ( ( high_accuracy and ( (n_m < 27606771) and (n_0 < 334953) ) ) or (not high_accuracy and (n_m < 1173136)) ): #1100000:
#need <= to match 0.0
if n_m <= acceptable_manhattan_diff: #27606771: #1100000:
#sample:
#> 1022339.0 for 2012-08-13-205321_1920x1080_scrot.png
#print((n_0 * 1.0 / img_f_np_ndarray.size) )
print('\n[' + str(ei) + '] \x1b[1;35m[ %s\x1b[0m\x1b[K\x1b[1;35m ]\x1b[0m\x1b[K' % path)
#print('\x1b[1;32m[Matched]\x1b[0m\x1b[K Manhattan norm:', n_m, '/ per pixel:', n_m / img_f_np_ndarray.size,\
# ' Zero norm:', n_0, '/ per pixel:', n_0 * 1.0 / img_f_np_ndarray.size)
print('\x1b[1;32m[Matched]\x1b[0m\x1b[K Manhattan norm:', n_m, '/ per pixel:', n_m / img_f_np_ndarray.size)
total_matched += 1
create_symlink(ln_dir_path, path)
if args.show is not None:
if (args.show == 0) or ((total_matched) <= args.show):
Image.open(path).show()
'''
elif n_0 <= 334953: #(not high_accuracy and n_0 < 14580): # 2nd priority #16380 #16160 #15882:
print('\n[' + str(ei) + '] \x1b[1;35m[ %s\x1b[0m\x1b[K\x1b[1;35m ]\x1b[0m\x1b[K' % path)
print('Manhattan norm:', n_m, '/ per pixel:', n_m / img_f_np_ndarray.size,\
' \x1b[1;32m[Matched]\x1b[0m\x1b[K Zero norm:', n_0, '/ per pixel:', n_0 * 1.0 / img_f_np_ndarray.size)
total_matched += 1
create_symlink(ln_dir_path, path)
if args.show is not None:
if (args.show == 0) or ((total_matched) <= args.show):
Image.open(path).show()
'''
#elif n_0 <= acceptable_zero_diff:
# quit('Something wrong in algo, pls check.')
else:
create_symlink(lnm_dir_path, path)
total_not_match += 1
if not args.m:
print('\n[' + str(ei) + '] \x1b[1;35m[ %s\x1b[0m\x1b[K\x1b[1;35m ]\x1b[0m\x1b[K' % path)
#print('\x1b[1;31mNot Match: \x1b[0m\x1b[K Manhattan norm:', n_m, '/ per pixel:', n_m / img_f_np_ndarray.size,\
# ' \x1b[1;31mNot Match: \x1b[0m\x1b[K Zero norm:', n_0, '/ per pixel:', n_0 * 1.0 / img_f_np_ndarray.size)
print('\x1b[1;31mNot Match: \x1b[0m\x1b[K Manhattan norm:', n_m, '/ per pixel:', n_m / img_f_np_ndarray.size)
else:
if PY3: # 3.3 might raise a different exception, but I only have the latest version 3.6.8 to test
# , see https://stackoverflow.com/questions/28633555/how-to-handle-filenotfounderror-when-try-except-ioerror-does-not-catch-it
try:
dir_img_f_hash_diff, dir_img_f_hash_avg, write_once = diff_file(path, hash_cache_f, write_once, dir_path, False)
except FileNotFoundError:
if args.verbose:
print('No such file exists.')
continue
except OSError:
if args.verbose:
print('\x1b[0;35mProbably not an image.\x1b[0m\x1b[K')
total_not_a_image += 1
continue
except:
if args.verbose:
print('\x1b[0;35mUnexpected error:\x1b[0m', sys.exc_info()[0],'\x1b[K')
continue
else:
try:
dir_img_f_hash_diff, dir_img_f_hash_avg, write_once = diff_file(path, hash_cache_f, write_once, dir_path, False)
except IOError: # in Python 2 it's hard to distinguish a "not an image" error from a "no such file" exception
if args.verbose:
print('\x1b[0;35mProbably not an image.\x1b[0m\x1b[K')
total_not_a_image += 1
continue
except:
if args.verbose:
print('\x1b[0;35mUnexpected error:\x1b[0m', sys.exc_info()[0],'\x1b[K')
continue
if share_print(img_f_hash_diff, img_f_hash_avg, dir_img_f_hash_diff, dir_img_f_hash_avg, path, ei, total_matched):
total_matched += 1
create_symlink(ln_dir_path, path)
else:
total_not_match += 1
create_symlink(lnm_dir_path, path)
else:
if args.verbose:
print('Skipping directory by -md: ' + subdir)
else:
if args.verbose:
print('read cache')
dir_img_f_hash_diff = 0
dir_img_f_hash_avg = 0
from_path_len = 0
# The first 3 lines are: cache file structure version -> number of lines in from_path -> from_path
# Each following block of 4 lines is: number of lines in path -> path -> diff hash -> avg hash
# Line counts are stored so that file paths containing newlines are supported
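# A hypothetical hash.cache written by -w would therefore look like (values are made up):
#   1.1                          <- cache file structure version
#   1                            <- from_path spans 1 line
#   /home/user/pictures          <- scanned directory (from_path)
#   1                            <- this image path spans 1 line
#   /home/user/pictures/a.jpg    <- image path
#   9331882296111890817          <- difference hash
#   9259542123273814144          <- average hash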
from_path = ''
done_read_header_version = False
done_read_header_path_count = False
done_read_header_path = False
done_read_header = False
path = ''
done_read_path_count = False
done_read_path = False
done_read_hash_diff = False
ei = 0
for l in hash_cache_f.readlines():
#print('reading ? ' + str(l))
if done_read_header:
if done_read_path_count:
if done_read_path:
if done_read_hash_diff: # diff hash already read, now read the avg hash
dir_img_f_hash_avg = int(l)
ei+=1
path = path[:-1] #strip trailing \n (only the last of multiple lines needs stripping)
co = path[from_path_len+1:].count(OS_SEP)
#print('co path: ' + path)
#print('co: ' + str(co) + ' from_path_len: ' + str(from_path_len))
# --max-depth 0 should do nothing, conforms to find command
if (max_depth is None) or (co < max_depth):
if share_print(img_f_hash_diff, img_f_hash_avg, dir_img_f_hash_diff, dir_img_f_hash_avg, path, ei, total_matched):
total_matched += 1
create_symlink(ln_dir_path, path)
else:
total_not_match += 1
create_symlink(lnm_dir_path, path)
else:
if args.verbose:
print('Skipping directory by -md: ' + path)
#reset before next read file block
path = ''
done_read_path_count = False
done_read_path = False
done_read_hash_diff = False
else: # read the diff hash
dir_img_f_hash_diff = int(l)
done_read_hash_diff = True
else:
path_total-=1
path+=l #[:-1]
if path_total == 0:
done_read_path = True
else:
#print('reading: ' + str(l))
path_total = int(l)
done_read_path_count = True
elif not done_read_header_version:
done_read_header_version = True
try:
curr_cache_file_version = float(l)
except ValueError:
curr_cache_file_version = 1.0 #first version
if curr_cache_file_version != __cache_file_version__:
quit('Cache file structure already changed in new version. Please use -w to re-cache.')
elif not done_read_header_path_count:
done_read_header_path_count = True
path_total = int(l)
else:
path_total-=1
if path_total == 0:
from_path+=l[:-1] # remove trailing \n
from_path_len+=len(from_path) # compute the length once on the last line, not accumulated per line
if args.verbose:
print('cache header path is: ' + from_path)
done_read_header = True
done_read_path_count = False
else:
from_path+=l
if not args.norm:
try:
img_f.fp.close()
except AttributeError:
pass
if hash_cache_f is not None:
hash_cache_f.close()
print()
if args.c is False:
# With -c this count is always 0, so there's no need to print it.
# Even if a file is intentionally changed to a non-image it's fine, since -c only reads the cache without re-scanning at all
#, except for scanning the input image.
if total_not_a_image > 0:
print(
'\x1b[0;52mTotal of\x1b[0m\x1b[K \x1b[0;35mnot an image: \x1b[0m\x1b[K' + str(total_not_a_image))
print('\x1b[1;52mTotal of\x1b[0m\x1b[K \x1b[1;35mimages: \x1b[0m\x1b[K' +
str(total_not_match + total_matched))
print('\x1b[1;52mTotal of\x1b[0m\x1b[K \x1b[1;31mnot match: \x1b[0m\x1b[K' +
str(total_not_match))
print(
'\x1b[1;52mTotal of\x1b[0m\x1b[K \x1b[1;32mmatched: \x1b[0m\x1b[K' + str(total_matched))