Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[BUGFIX] fix model zoo parallel download #17336

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions python/mxnet/gluon/model_zoo/model_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import os
import zipfile
import logging
import uuid

from ..utils import download, check_sha1
from ... import base, util
Expand Down Expand Up @@ -93,6 +94,8 @@ def get_model_file(name, root=os.path.join(base.data_dir(), 'models')):
root = os.path.expanduser(root)
file_path = os.path.join(root, file_name+'.params')
sha1_hash = _model_sha1[name]

random_uuid = str(uuid.uuid4())
if os.path.exists(file_path):
if check_sha1(file_path, sha1_hash):
return file_path
Expand All @@ -103,16 +106,16 @@ def get_model_file(name, root=os.path.join(base.data_dir(), 'models')):

util.makedirs(root)

zip_file_path = os.path.join(root, file_name+'.zip')
temp_zip_file_path = os.path.join(root, file_name+random_uuid+'.zip')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to use tempfile so that it takes care of cleanup automatically? See https://docs.python.org/3/library/tempfile.html

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe tempfile typically creates its files on a different filesystem, so no atomic rename operation to the target directory and filename is available.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@leezu What do you mean by a different filesystem? Regarding the name, I think you can also specify a file name using the NamedTemporaryFile class?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right, I overlooked the dir argument.

repo_url = os.environ.get('MXNET_GLUON_REPO', apache_repo_url)
if repo_url[-1] != '/':
repo_url = repo_url + '/'
download(_url_format.format(repo_url=repo_url, file_name=file_name),
path=zip_file_path,
path=temp_zip_file_path,
overwrite=True)
with zipfile.ZipFile(zip_file_path) as zf:
with zipfile.ZipFile(temp_zip_file_path) as zf:
zf.extractall(root)
os.remove(zip_file_path)
os.remove(temp_zip_file_path)

if check_sha1(file_path, sha1_hash):
return file_path
Expand Down
4 changes: 2 additions & 2 deletions python/mxnet/gluon/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
import numpy as np

from .. import ndarray
from ..util import is_np_shape, is_np_array
from ..util import is_np_shape, is_np_array, makedirs
from .. import numpy as _mx_np # pylint: disable=reimported


Expand Down Expand Up @@ -298,7 +298,7 @@ def download(url, path=None, overwrite=False, sha1_hash=None, retries=5, verify_
if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)):
dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
if not os.path.exists(dirname):
os.makedirs(dirname)
makedirs(dirname)
while retries + 1 > 0:
# Disable pylint too broad Exception
# pylint: disable=W0703
Expand Down
16 changes: 16 additions & 0 deletions tests/python/unittest/test_gluon_model_zoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,28 @@
import mxnet as mx
from mxnet.gluon.model_zoo.vision import get_model
import sys
import threading
from common import setup_module, with_seed, teardown


def eprint(*args, **kwargs):
    """Print ``args`` to standard error.

    Takes the same positional and keyword arguments as the built-in
    ``print``. Output goes to ``sys.stderr`` unless the caller supplies an
    explicit ``file=`` keyword, in which case that stream is used.
    """
    # setdefault (rather than passing file=sys.stderr alongside **kwargs)
    # avoids a "multiple values for keyword argument 'file'" TypeError when
    # the caller already provided ``file``.
    kwargs.setdefault('file', sys.stderr)
    print(*args, **kwargs)

@with_seed()
def test_parallel_download():
    """Request the same pretrained model from several threads at once.

    Ten threads call ``get_model(name, pretrained=True)`` concurrently for
    the same model name. Any exception raised in a worker is recorded and
    the test fails if at least one worker failed.

    NOTE(review): a reviewer asked why multiprocessing is not used here,
    on the understanding that in Horovod ``get_model`` is parallelized at
    process level rather than thread level. This test only covers the
    thread-level case -- TODO confirm whether a process-level variant is
    needed.
    """
    # An exception raised inside a threading.Thread does not propagate to
    # the thread calling join(), so failures must be collected explicitly;
    # otherwise the test would pass even if every download failed.
    errors = []

    def fn(model_name):
        try:
            model = get_model(model_name, pretrained=True)
            print(type(model))
        except Exception as exc:  # pylint: disable=broad-except
            errors.append(exc)

    name = 'mobilenetv2_0.25'
    threads = [threading.Thread(target=fn, args=(name,)) for _ in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    assert not errors, \
        'get_model failed in {} of {} threads: {!r}'.format(
            len(errors), len(threads), errors)

@with_seed()
def test_models():
Expand Down