From cd98a60d0b361410d9e3320943fa13bb2b785a21 Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Thu, 20 Sep 2018 14:33:20 -0700 Subject: [PATCH 1/6] avoid recursionlimit error --- python/mxnet/gluon/data/dataloader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index 1c54158a2ba4..1ab065b6cd0a 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -175,7 +175,9 @@ def _recursive_fork_recordio(obj, depth, max_depth=1000): def worker_loop(dataset, key_queue, data_queue, batchify_fn): """Worker loop for multiprocessing DataLoader.""" # re-fork a new recordio handler in new process if applicable - _recursive_fork_recordio(dataset, 0, 1000) + if sys.getrecursionlimit() < 1000: + sys.setrecursionlimit(1000) + _recursive_fork_recordio(dataset, 0, 1000 - 5) # reserve 5 stack in ops while True: idx, samples = key_queue.get() From a64b72177a1c04bfb0fb2112a344c6cffabdbfd2 Mon Sep 17 00:00:00 2001 From: "Joshua Z. 
Zhang" Date: Thu, 20 Sep 2018 14:49:01 -0700 Subject: [PATCH 2/6] add unittest --- python/mxnet/gluon/data/dataloader.py | 4 +- tests/python/unittest/test_gluon_data.py | 47 +++++++++++++++--------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index 1ab065b6cd0a..c0d2dc1a28a0 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -175,9 +175,7 @@ def _recursive_fork_recordio(obj, depth, max_depth=1000): def worker_loop(dataset, key_queue, data_queue, batchify_fn): """Worker loop for multiprocessing DataLoader.""" # re-fork a new recordio handler in new process if applicable - if sys.getrecursionlimit() < 1000: - sys.setrecursionlimit(1000) - _recursive_fork_recordio(dataset, 0, 1000 - 5) # reserve 5 stack in ops + _recursive_fork_recordio(dataset, 0, sys.getrecursionlimit() - 5) while True: idx, samples = key_queue.get() diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index cc80aacb6447..75f7aa60e57c 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -75,24 +75,35 @@ def test_recordimage_dataset(): @with_seed() def test_recordimage_dataset_with_data_loader_multiworker(): - # This test is pointless on Windows because Windows doesn't fork - if platform.system() != 'Windows': - recfile = prepare_record() - dataset = gluon.data.vision.ImageRecordDataset(recfile) - loader = gluon.data.DataLoader(dataset, 1, num_workers=5) - - for i, (x, y) in enumerate(loader): - assert x.shape[0] == 1 and x.shape[3] == 3 - assert y.asscalar() == i - - # with transform - fn = lambda x, y : (x, y) - dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(fn) - loader = gluon.data.DataLoader(dataset, 1, num_workers=5) - - for i, (x, y) in enumerate(loader): - assert x.shape[0] == 1 and x.shape[3] == 3 - assert y.asscalar() == i + recfile = 
prepare_record() + dataset = gluon.data.vision.ImageRecordDataset(recfile) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5) + + for i, (x, y) in enumerate(loader): + assert x.shape[0] == 1 and x.shape[3] == 3 + assert y.asscalar() == i + + # with transform + fn = lambda x, y : (x, y) + dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(fn) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5) + + for i, (x, y) in enumerate(loader): + assert x.shape[0] == 1 and x.shape[3] == 3 + assert y.asscalar() == i + + # try limit recursion depth + import sys + old_limit = sys.getrecursionlimit() + sys.setrecursionlimit(10) # this should be smaller than any default value used in python + fn = lambda x, y : (x, y) + dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(fn) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5) + + for i, (x, y) in enumerate(loader): + assert x.shape[0] == 1 and x.shape[3] == 3 + assert y.asscalar() == i + sys.setrecursionlimit(old_limit) @with_seed() def test_sampler(): From e2d65653d16e38e8f949b49e0c8b9d3e5821c43c Mon Sep 17 00:00:00 2001 From: "Joshua Z. 
Zhang" Date: Thu, 20 Sep 2018 16:28:44 -0700 Subject: [PATCH 3/6] recursion limit 100 --- tests/python/unittest/test_gluon_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index 75f7aa60e57c..b0c9635069ac 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -95,7 +95,7 @@ def test_recordimage_dataset_with_data_loader_multiworker(): # try limit recursion depth import sys old_limit = sys.getrecursionlimit() - sys.setrecursionlimit(10) # this should be smaller than any default value used in python + sys.setrecursionlimit(100) # this should be smaller than any default value used in python fn = lambda x, y : (x, y) dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(fn) loader = gluon.data.DataLoader(dataset, 1, num_workers=5) From 2c09b4480b11534ddacc461f66733a8289b19178 Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Thu, 20 Sep 2018 18:14:10 -0700 Subject: [PATCH 4/6] fix pickling void* in windows --- python/mxnet/recordio.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py index 2ebe657accbd..4a944b900a56 100644 --- a/python/mxnet/recordio.py +++ b/python/mxnet/recordio.py @@ -83,6 +83,17 @@ def open(self): def __del__(self): self.close() + def __getstate__(self): + # pickling pointer is not allowed + d = dict(self.__dict__) + d['is_open'] = False + del d['handle'] + return d + + def __setstate__(self, d): + self.__dict__ = d + self.handle = RecordIOHandle() + def close(self): """Closes the record file.""" if not self.is_open: From aaef2313ab667d0ba34ee3ec62700e37dbbc0ba4 Mon Sep 17 00:00:00 2001 From: "Joshua Z. 
Zhang" Date: Mon, 24 Sep 2018 00:12:58 -0700 Subject: [PATCH 5/6] fix pickling for windows and unittest --- python/mxnet/recordio.py | 23 ++++++++++++++++++++++- tests/python/unittest/test_gluon_data.py | 12 +++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py index 4a944b900a56..6fc4d8e7bf57 100644 --- a/python/mxnet/recordio.py +++ b/python/mxnet/recordio.py @@ -84,15 +84,30 @@ def __del__(self): self.close() def __getstate__(self): + """Override pickling behavior.""" # pickling pointer is not allowed + is_open = self.is_open + self.close() d = dict(self.__dict__) - d['is_open'] = False + d['is_open'] = is_open + uri = self.uri.value + try: + uri = uri.decode('utf-8') + except AttributeError: + pass del d['handle'] + d['uri'] = uri return d def __setstate__(self, d): + """Restore from pickled.""" self.__dict__ = d + is_open = d['is_open'] + self.is_open = False self.handle = RecordIOHandle() + self.uri = c_str(self.uri) + if is_open: + self.open() def close(self): """Closes the record file.""" @@ -228,6 +243,12 @@ def close(self): super(MXIndexedRecordIO, self).close() self.fidx.close() + def __getstate__(self): + """Override pickling behavior.""" + d = super(MXIndexedRecordIO, self).__getstate__() + d['fidx'] = None + return d + def seek(self, idx): """Sets the current read pointer position. 
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index b0c9635069ac..c731f8d782d1 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -73,6 +73,10 @@ def test_recordimage_dataset(): assert x.shape[0] == 1 and x.shape[3] == 3 assert y.asscalar() == i +def _dataset_transform_fn(x, y): + """Named transform function since lambda function cannot be pickled.""" + return x, y + @with_seed() def test_recordimage_dataset_with_data_loader_multiworker(): recfile = prepare_record() @@ -84,8 +88,7 @@ def test_recordimage_dataset_with_data_loader_multiworker(): assert y.asscalar() == i # with transform - fn = lambda x, y : (x, y) - dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(fn) + dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(_dataset_transform_fn) loader = gluon.data.DataLoader(dataset, 1, num_workers=5) for i, (x, y) in enumerate(loader): @@ -95,9 +98,8 @@ def test_recordimage_dataset_with_data_loader_multiworker(): # try limit recursion depth import sys old_limit = sys.getrecursionlimit() - sys.setrecursionlimit(100) # this should be smaller than any default value used in python - fn = lambda x, y : (x, y) - dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(fn) + sys.setrecursionlimit(500) # this should be smaller than any default value used in python + dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(_dataset_transform_fn) loader = gluon.data.DataLoader(dataset, 1, num_workers=5) for i, (x, y) in enumerate(loader): From 6cf0e3d3bc2d4b4530ad15f92f4c895eb40253d3 Mon Sep 17 00:00:00 2001 From: "Joshua Z. 
Zhang" Date: Mon, 24 Sep 2018 11:44:51 -0700 Subject: [PATCH 6/6] explain --- python/mxnet/gluon/data/dataloader.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index c0d2dc1a28a0..50e2ad9f784d 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -175,7 +175,12 @@ def _recursive_fork_recordio(obj, depth, max_depth=1000): def worker_loop(dataset, key_queue, data_queue, batchify_fn): """Worker loop for multiprocessing DataLoader.""" # re-fork a new recordio handler in new process if applicable - _recursive_fork_recordio(dataset, 0, sys.getrecursionlimit() - 5) + # for a dataset with transform function, the depth of MXRecordIO is 1 + # for a lazy transformer, the depth is 2 + # for a user defined transformer, the depth is unknown, try a reasonable depth + limit = sys.getrecursionlimit() + max_recursion_depth = min(limit - 5, max(10, limit // 2)) + _recursive_fork_recordio(dataset, 0, max_recursion_depth) while True: idx, samples = key_queue.get()