Skip to content

Commit bbf2de5

Browse files
committed
optimizations
1 parent 90536f2 commit bbf2de5

File tree

1 file changed

+39
-51
lines changed

1 file changed

+39
-51
lines changed

src/core.py

+39 -51
Original file line number | Diff line number | Diff line change
@@ -110,11 +110,9 @@ def fnumber(num):
110110
class CRCThreadedCalc:
111111
def __init__(self,log):
112112
self.log=log
113-
self.source=[]
114-
self.source_other_data=[]
115113

116-
#self.results=[]
117-
self.results_dict={}
114+
self.data_dict={}
115+
118116
self.file_info=(0,None)
119117
self.progress_info=0
120118
self.abort_action=False
@@ -134,61 +132,58 @@ def abort(self):
134132
def calc(self):
135133
from hashlib import sha1
136134

137-
CRC_BUFFER_SIZE=4*1024*1024
138-
buf = bytearray(CRC_BUFFER_SIZE)
139-
view = memoryview(buf)
135+
size_threshold=8*1024*1024
136+
block_size=1*1024*1024
140137

141138
self.started=True
142139

143-
#preallocate
144-
#self.results=[None]*len(self.source)
145-
self_results_dict = self.results_dict = {i:None for i in range(len(self.source))}
146-
#self_results=self.results
140+
self_data_dict = self.data_dict
147141

148142
self.size_done = 0
149143
self.files_done = 0
150144

151-
#i=0
152-
for fullpath,size in self.source:
145+
files_done_local = 0
146+
147+
for (size,fullpath),(pathnr,path,file_name,mtime,ctime,inode) in list(sorted(self_data_dict.items(),key = lambda x : int(x[0][0]),reverse=True)):
153148
try:
154149
file_handle=open(fullpath,'rb')
155-
file_handle_readinto=file_handle.readinto
156150
except Exception as e:
157151
self.log.error(e)
158152

159153
if self.abort_action:
160-
return
154+
sys_exit()
161155
else:
162-
hasher = sha1()
163-
hasher_update=hasher.update
156+
if size<size_threshold:
157+
self_data_dict[(size,fullpath)]=(pathnr,path,file_name,mtime,ctime,inode,sha1(file_handle.read()).hexdigest())
158+
file_handle.close()
164159

165-
#faster for smaller files
166-
if size<CRC_BUFFER_SIZE:
167-
hasher_update(view[:file_handle_readinto(buf)])
160+
if self.abort_action:
161+
sys_exit()
168162
else:
169-
while rsize := file_handle_readinto(buf):
170-
hasher_update(view[:rsize])
163+
hasher = sha1()
164+
hasher_update=hasher.update
165+
166+
file_handle_read_block_size=lambda : file_handle.read(block_size)
167+
while chunk := file_handle_read_block_size():
168+
hasher_update(chunk)
171169

172-
if rsize==CRC_BUFFER_SIZE:
173-
#still reading
174-
self.progress_info+=rsize
170+
self.progress_info+=len(chunk)
175171

176172
if self.abort_action:
177173
break
178174

179175
self.progress_info=0
176+
file_handle.close()
180177

181-
file_handle.close()
182-
183-
if self.abort_action:
184-
return
178+
if self.abort_action:
179+
sys_exit() #thread
185180

186-
#only complete result
187-
#self_results_dict[self.files_done]=self_results[self.files_done]=hasher.hexdigest()
188-
self_results_dict[self.files_done]=hasher.hexdigest()
181+
#only complete result
182+
self_data_dict[(size,fullpath)]=(pathnr,path,file_name,mtime,ctime,inode,hasher.hexdigest())
189183

190184
self.size_done += size
191-
self.files_done += 1
185+
files_done_local += 1
186+
self.files_done = files_done_local
192187

193188
sys_exit() #thread
194189

@@ -1246,9 +1241,7 @@ def crc_calc(self):
12461241

12471242
fullpath=self.get_full_path_to_scan(pathnr,path,file_name)
12481243

1249-
crc_core_dev = crc_core[dev]
1250-
crc_core_dev.source.append( (fullpath,size) )
1251-
crc_core_dev.source_other_data.append( (pathnr,path,file_name,mtime,ctime,inode) )
1244+
crc_core[dev].data_dict[(size,fullpath)]=(pathnr,path,file_name,mtime,ctime,inode)
12521245

12531246
self.info=''
12541247
self.log.info('using cache done.')
@@ -1329,10 +1322,9 @@ def crc_calc(self):
13291322
crc_to_combo=defaultdict(set)
13301323

13311324
for dev in self_devs:
1332-
#for (fullpath,size),crc in zip(crc_core[dev].source,crc_core[dev].results):
1333-
for (fullpath,size),crc in zip(crc_core[dev].source,[val for key,val in sorted(crc_core[dev].results_dict.items(), key = lambda x : x[0])] ):
1334-
if crc:
1335-
crc_to_combo[crc].add( (size,dirname(fullpath)) )
1325+
for (size,fullpath),val in crc_core[dev].data_dict.items():
1326+
if len(val)==7:
1327+
crc_to_combo[val[6]].add( (size,dirname(fullpath)) )
13361328

13371329
for size,size_dict in self_files_of_size_of_crc_items():
13381330
for crc,crc_dict in size_dict.items():
@@ -1345,8 +1337,7 @@ def crc_calc(self):
13451337
temp_info_folders_set=set()
13461338

13471339
for crc,crc_combo in crc_to_combo.items():
1348-
len_crc_combo = len(crc_combo)
1349-
if len_crc_combo>1:
1340+
if len(crc_combo)>1:
13501341
temp_info_groups+=1
13511342
for size,dirpath in crc_combo:
13521343
temp_info_dupe_space+=size
@@ -1372,15 +1363,13 @@ def crc_calc(self):
13721363
if crc_core_dev.started:
13731364
crc_core_dev.join()
13741365

1375-
#for (fullpath,size),(pathnr,path,file_name,mtime,ctime,inode),crc in zip(crc_core_dev.source,crc_core_dev.source_other_data,crc_core_dev.results):
1376-
for (fullpath,size),(pathnr,path,file_name,mtime,ctime,inode),crc in zip(crc_core_dev.source,crc_core_dev.source_other_data,[val for key,val in sorted(crc_core_dev.results_dict.items(), key = lambda x : x[0])]):
1377-
if crc:
1378-
index_tuple=(pathnr,path,file_name,ctime,dev,inode)
1379-
1380-
self_files_of_size_of_crc[size][crc].add( index_tuple )
1366+
#for (size,fullpath),val in sorted(crc_core_dev.data_dict.items(), key = lambda x : int(x[0][0]), reverse=True):
1367+
for (size,fullpath),val in crc_core_dev.data_dict.items():
1368+
if len(val)==7:
1369+
pathnr,path,file_name,mtime,ctime,inode,crc=val
1370+
self_files_of_size_of_crc[size][crc].add( (pathnr,path,file_name,ctime,dev,inode) )
13811371

1382-
cache_key=(inode,mtime)
1383-
self.crc_cache[dev][cache_key]=crc
1372+
self.crc_cache[dev][(inode,mtime)]=crc
13841373
del crc_core
13851374
########################################################################
13861375

@@ -1682,7 +1671,6 @@ def link_wrapper(self,\
16821671
(path_nr_keep,path_keep,file_keep,ctime_keep,dev_keep,inode_keep)=index_tuple_ref
16831672

16841673
self_get_full_path_scanned = self.get_full_path_scanned
1685-
#self_files_of_size_of_crc_size_crc = self.files_of_size_of_crc[size][crc]
16861674

16871675
if operation_mode in (MODE_SIMILARITY,MODE_GPS):
16881676
print('imposible1')

0 commit comments

Comments
 (0)