Issue #104: Improve restore session performance
Nekmo committed Mar 6, 2022
Parent: 8aed3cb · Commit: 919e44f
Showing 1 changed file with 17 additions and 8 deletions.
dirhunt/crawler.py (25 changes: 17 additions & 8 deletions)
@@ -75,7 +75,7 @@ def add_init_urls(self, *urls):
             if not isinstance(crawler_url, CrawlerUrl):
                 crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
             self.add_domain(crawler_url.url.only_domain)
-            self.add_url(crawler_url)
+            self.add_url(crawler_url, lock=False)
 
     def in_domains(self, domain):
         if self.not_follow_subdomains and domain not in self.domains:
@@ -98,29 +98,36 @@ def add_domain(self, domain):
         self.domains.add(domain)
         self.sources.add_domain(domain)
 
-    def add_url(self, crawler_url, force=False):
+    def add_url(self, crawler_url, force=False, lock=True):
         """Add url to queue"""
+        if self.closing:
+            return
         if not isinstance(crawler_url, CrawlerUrl):
             crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
-        self.add_lock.acquire()
+        if lock:
+            self.add_lock.acquire()
         url = crawler_url.url
         if not url.is_valid() or not url.only_domain or not self.in_domains(url.only_domain):
-            self.add_lock.release()
+            if lock:
+                self.add_lock.release()
             return
         if url.url in self.processing or url.url in self.processed:
-            self.add_lock.release()
+            if lock:
+                self.add_lock.release()
             return self.processing.get(url.url) or self.processed.get(url.url)
 
         fn = reraise_with_stack(crawler_url.start)
         if self.closing:
-            self.add_lock.release()
+            if lock:
+                self.add_lock.release()
             return
         if force:
             future = ThreadPoolExecutor(max_workers=1).submit(fn)
         else:
             future = self.submit(fn)
         self.processing[url.url] = future
-        self.add_lock.release()
+        if lock:
+            self.add_lock.release()
         return future
 
     def add_message(self, body):
@@ -173,6 +180,8 @@ def print_results(self, exclude=None, include=None):
             if self.current_processed_count >= self.limit and self.limit:
                 # Force shutdown
                 self.closing = True
+                self.erase()
+                self.echo('Results limit reached ({}). Finishing...'.format(self.limit))
                 self.shutdown()
                 self.erase()
                 self.print_progress(True)
@@ -252,7 +261,7 @@ def resume(self, path):
             self.processed[url] = crawler_url
             self.echo(data['line'])
         for url in resume_data['processing']:
-            self.add_url(url)
+            self.add_url(url, lock=False)
 
     def json(self):
         urls_infos = self.urls_info.urls_info if self.urls_info else []
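
For context on the change itself: add_url() now accepts a lock flag. Concurrent callers keep the default lock=True and serialize on add_lock as before, while the bulk callers touched by this commit (add_init_urls() and resume()) pass lock=False and skip one acquire/release per URL, which appears to be the restore-session speedup the commit title refers to. Below is a minimal sketch of that pattern; TinyCrawler, restore_session, and the submitted task are illustrative stand-ins, not dirhunt's real API.

# Minimal sketch of the lock=True/lock=False pattern (illustrative names only,
# not dirhunt's real classes).
from concurrent.futures import ThreadPoolExecutor
from threading import Lock


class TinyCrawler:
    def __init__(self):
        self.add_lock = Lock()
        self.processing = {}
        self.executor = ThreadPoolExecutor(max_workers=4)

    def add_url(self, url, lock=True):
        # lock=True: safe for concurrent callers. lock=False: the caller
        # promises nothing else is mutating self.processing right now.
        if lock:
            self.add_lock.acquire()
        try:
            if url in self.processing:
                return self.processing[url]          # already queued
            future = self.executor.submit(len, url)  # stand-in for the real crawl task
            self.processing[url] = future
            return future
        finally:
            if lock:
                self.add_lock.release()

    def restore_session(self, urls):
        # Bulk, single-writer phase: skipping the lock avoids one
        # acquire/release per restored URL.
        for url in urls:
            self.add_url(url, lock=False)


crawler = TinyCrawler()
crawler.restore_session(["http://example.com/a", "http://example.com/b"])

The trade-off is that lock=False is only safe when the caller can guarantee no other thread is adding URLs at the same time, which the commit evidently assumes holds during initial URL loading and session restore.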
