@@ -110,11 +110,9 @@ def fnumber(num):
110
110
class CRCThreadedCalc :
111
111
def __init__ (self ,log ):
112
112
self .log = log
113
- self .source = []
114
- self .source_other_data = []
115
113
116
- # self.results=[]
117
- self . results_dict = {}
114
+ self .data_dict = {}
115
+
118
116
self .file_info = (0 ,None )
119
117
self .progress_info = 0
120
118
self .abort_action = False
@@ -134,61 +132,58 @@ def abort(self):
134
132
def calc(self):
    """Compute a SHA-1 digest for every file registered in ``self.data_dict``.

    ``self.data_dict`` maps ``(size, fullpath)`` to the 6-tuple
    ``(pathnr, path, file_name, mtime, ctime, inode)``.  For each file that
    is hashed completely the value is replaced by a 7-tuple with the hex
    digest appended; entries whose file could not be opened or read keep
    their 6-tuple, and the error is reported through ``self.log.error``.

    Progress is published through attributes on ``self``:

    * ``started``       -- set to True as soon as the method runs,
    * ``size_done``     -- sum of the ``size`` keys processed so far,
    * ``files_done``    -- number of entries processed so far (failed ones
      included),
    * ``progress_info`` -- bytes read so far from the file currently being
      hashed block-wise, reset to 0 between files.

    The method never returns normally: it always finishes by calling
    ``sys_exit()`` -- presumably ``sys.exit``, used to end the worker thread
    (confirm against the module imports).  Setting ``self.abort_action``
    makes it stop after the current file (or current block for files hashed
    block-wise) without storing a partial digest.
    """
    from hashlib import sha1

    # entries whose size is below the threshold are read with a single
    # read(); larger ones are streamed in block_size pieces
    size_threshold = 8 * 1024 * 1024
    block_size = 1 * 1024 * 1024

    self.started = True

    self_data_dict = self.data_dict

    self.size_done = 0
    self.files_done = 0

    files_done_local = 0

    # sorted() builds a new list, so the dict values can be replaced while
    # looping; entries are visited in descending size order
    pending = sorted(self_data_dict.items(), key=lambda x: int(x[0][0]), reverse=True)

    for (size, fullpath), (pathnr, path, file_name, mtime, ctime, inode) in pending:
        # stays None unless the whole file was hashed
        digest = None

        try:
            # 'with' guarantees the handle is closed even if read() raises
            with open(fullpath, 'rb') as file_handle:
                if size < size_threshold:
                    digest = sha1(file_handle.read()).hexdigest()
                else:
                    hasher = sha1()
                    hasher_update = hasher.update
                    file_handle_read = file_handle.read

                    while chunk := file_handle_read(block_size):
                        hasher_update(chunk)

                        self.progress_info += len(chunk)

                        if self.abort_action:
                            break
                    else:
                        # loop ended at EOF, not by abort: digest is complete
                        digest = hasher.hexdigest()
        except Exception as e:
            # open *or* read failure: log it and carry on with the next
            # file instead of letting the exception end the whole run
            self.log.error(e)
        finally:
            self.progress_info = 0

        # only complete result
        if digest is not None:
            self_data_dict[(size, fullpath)] = (pathnr, path, file_name, mtime, ctime, inode, digest)

        if self.abort_action:
            sys_exit()  # thread

        self.size_done += size
        files_done_local += 1
        self.files_done = files_done_local

    sys_exit()  # thread
194
189
@@ -1246,9 +1241,7 @@ def crc_calc(self):
1246
1241
1247
1242
fullpath = self .get_full_path_to_scan (pathnr ,path ,file_name )
1248
1243
1249
- crc_core_dev = crc_core [dev ]
1250
- crc_core_dev .source .append ( (fullpath ,size ) )
1251
- crc_core_dev .source_other_data .append ( (pathnr ,path ,file_name ,mtime ,ctime ,inode ) )
1244
+ crc_core [dev ].data_dict [(size ,fullpath )]= (pathnr ,path ,file_name ,mtime ,ctime ,inode )
1252
1245
1253
1246
self .info = ''
1254
1247
self .log .info ('using cache done.' )
@@ -1329,10 +1322,9 @@ def crc_calc(self):
1329
1322
crc_to_combo = defaultdict (set )
1330
1323
1331
1324
for dev in self_devs :
1332
- #for (fullpath,size),crc in zip(crc_core[dev].source,crc_core[dev].results):
1333
- for (fullpath ,size ),crc in zip (crc_core [dev ].source ,[val for key ,val in sorted (crc_core [dev ].results_dict .items (), key = lambda x : x [0 ])] ):
1334
- if crc :
1335
- crc_to_combo [crc ].add ( (size ,dirname (fullpath )) )
1325
+ for (size ,fullpath ),val in crc_core [dev ].data_dict .items ():
1326
+ if len (val )== 7 :
1327
+ crc_to_combo [val [6 ]].add ( (size ,dirname (fullpath )) )
1336
1328
1337
1329
for size ,size_dict in self_files_of_size_of_crc_items ():
1338
1330
for crc ,crc_dict in size_dict .items ():
@@ -1345,8 +1337,7 @@ def crc_calc(self):
1345
1337
temp_info_folders_set = set ()
1346
1338
1347
1339
for crc ,crc_combo in crc_to_combo .items ():
1348
- len_crc_combo = len (crc_combo )
1349
- if len_crc_combo > 1 :
1340
+ if len (crc_combo )> 1 :
1350
1341
temp_info_groups += 1
1351
1342
for size ,dirpath in crc_combo :
1352
1343
temp_info_dupe_space += size
@@ -1372,15 +1363,13 @@ def crc_calc(self):
1372
1363
if crc_core_dev .started :
1373
1364
crc_core_dev .join ()
1374
1365
1375
- #for (fullpath,size),(pathnr,path,file_name,mtime,ctime,inode),crc in zip(crc_core_dev.source,crc_core_dev.source_other_data,crc_core_dev.results):
1376
- for (fullpath ,size ),(pathnr ,path ,file_name ,mtime ,ctime ,inode ),crc in zip (crc_core_dev .source ,crc_core_dev .source_other_data ,[val for key ,val in sorted (crc_core_dev .results_dict .items (), key = lambda x : x [0 ])]):
1377
- if crc :
1378
- index_tuple = (pathnr ,path ,file_name ,ctime ,dev ,inode )
1379
-
1380
- self_files_of_size_of_crc [size ][crc ].add ( index_tuple )
1366
+ #for (size,fullpath),val in sorted(crc_core_dev.data_dict.items(), key = lambda x : int(x[0][0]), reverse=True):
1367
+ for (size ,fullpath ),val in crc_core_dev .data_dict .items ():
1368
+ if len (val )== 7 :
1369
+ pathnr ,path ,file_name ,mtime ,ctime ,inode ,crc = val
1370
+ self_files_of_size_of_crc [size ][crc ].add ( (pathnr ,path ,file_name ,ctime ,dev ,inode ) )
1381
1371
1382
- cache_key = (inode ,mtime )
1383
- self .crc_cache [dev ][cache_key ]= crc
1372
+ self .crc_cache [dev ][(inode ,mtime )]= crc
1384
1373
del crc_core
1385
1374
########################################################################
1386
1375
@@ -1682,7 +1671,6 @@ def link_wrapper(self,\
1682
1671
(path_nr_keep ,path_keep ,file_keep ,ctime_keep ,dev_keep ,inode_keep )= index_tuple_ref
1683
1672
1684
1673
self_get_full_path_scanned = self .get_full_path_scanned
1685
- #self_files_of_size_of_crc_size_crc = self.files_of_size_of_crc[size][crc]
1686
1674
1687
1675
if operation_mode in (MODE_SIMILARITY ,MODE_GPS ):
1688
1676
print ('imposible1' )
0 commit comments