Skip to content

Commit

Permalink
Resolver.download_to_directory: use retry mechanism with backoff strategy timing
Browse files Browse the repository at this point in the history
  • Loading branch information
bertsky committed Jul 12, 2023
1 parent 4d17f06 commit 3b040b7
Showing 1 changed file with 36 additions and 40 deletions.
76 changes: 36 additions & 40 deletions ocrd/ocrd/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from os import environ

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from ocrd.constants import TMP_PREFIX
from ocrd_utils import (
Expand Down Expand Up @@ -107,46 +109,40 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
log.debug("Downloading URL '%s' to '%s'", url, dst_path)
if 'OCRD_DOWNLOAD_RETRIES' in environ:
retries = retries or int(environ['OCRD_DOWNLOAD_RETRIES'])
retries = retries or 0
for _ in range(retries + 1):
try:
if timeout is None and 'OCRD_DOWNLOAD_TIMEOUT' in environ:
timeout = environ['OCRD_DOWNLOAD_TIMEOUT'].split(',')
if len(timeout) > 1:
timeout = tuple(float(x) for x in timeout)
else:
timeout = float(timeout[0])
response = requests.get(url, timeout=timeout)
if response.status_code in [
# probably too wide:
408, # Request Timeout
409, # Conflict
410, # Gone
412, # Precondition Failed
417, # Expectation Failed
423, # Locked
424, # Failed Dependency
425, # Too Early
426, # Upgrade Required
428, # Precondition Required
429, # Too Many Requests
440, # Login Timeout
500, # Internal Server Error
503, # Service Unavailable
504, # Gateway Timeout
509, # Bandwidth Limit Exceeded
529, # Site Overloaded
598, # Proxy Read Timeout
599, # Proxy Connect Timeout
]:
continue
except (requests.Timeout, requests.ConnectionError) as error:
response = error
continue
if isinstance(response, Exception):
raise Exception("HTTP request failed: %s (%s)" % (url, response))
if response.status_code != 200:
raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
if timeout is None and 'OCRD_DOWNLOAD_TIMEOUT' in environ:
timeout = environ['OCRD_DOWNLOAD_TIMEOUT'].split(',')
if len(timeout) > 1:
timeout = tuple(float(x) for x in timeout)
else:
timeout = float(timeout[0])
session = requests.Session()
retries = Retry(total=retries or 0,
status_forcelist=[
# probably too wide (only transient failures):
408, # Request Timeout
409, # Conflict
412, # Precondition Failed
417, # Expectation Failed
423, # Locked
424, # Failed Dependency
425, # Too Early
426, # Upgrade Required
428, # Precondition Required
429, # Too Many Requests
440, # Login Timeout
500, # Internal Server Error
503, # Service Unavailable
504, # Gateway Timeout
509, # Bandwidth Limit Exceeded
529, # Site Overloaded
598, # Proxy Read Timeout
599, # Proxy Connect Timeout
])
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)
response = session.get(url, timeout=timeout)
response.raise_for_status()
contents = handle_oai_response(response)
dst_path.write_bytes(contents)

Expand Down

0 comments on commit 3b040b7

Please sign in to comment.