diff --git a/docs/source/python/filesystems_deprecated.rst b/docs/source/python/filesystems_deprecated.rst index 51a07d5efa6..04887e97738 100644 --- a/docs/source/python/filesystems_deprecated.rst +++ b/docs/source/python/filesystems_deprecated.rst @@ -18,9 +18,9 @@ Filesystem Interface (legacy) ============================= -.. note:: - This section documents the deprecated filesystem layer. It is highly - recommended to use the :ref:`new filesystem layer ` instead. +.. warning:: + This section documents the deprecated filesystem layer. You should + use the :ref:`new filesystem layer ` instead. .. _hdfs: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9e22cc013f8..9f544a12184 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -32,6 +32,7 @@ import gc as _gc import os as _os import sys as _sys +import warnings as _warnings try: from ._generated_version import version as __version__ @@ -190,23 +191,53 @@ def show_versions(): SerializationCallbackError, DeserializationCallbackError) -from pyarrow.filesystem import FileSystem, LocalFileSystem - -from pyarrow.hdfs import HadoopFileSystem import pyarrow.hdfs as hdfs from pyarrow.ipc import serialize_pandas, deserialize_pandas import pyarrow.ipc as ipc - -localfs = LocalFileSystem.get_instance() - from pyarrow.serialization import (default_serialization_context, register_default_serialization_handlers, register_torch_serialization_handlers) import pyarrow.types as types + +# deprecated filesystems + +from pyarrow.filesystem import FileSystem as _FileSystem, LocalFileSystem as _LocalFileSystem +from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem + +_localfs = _LocalFileSystem._get_instance() + + +_msg = "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead." 
+
+_deprecated = {
+    "localfs": (_localfs, "LocalFileSystem"),
+    "FileSystem": (_FileSystem, "FileSystem"),
+    "LocalFileSystem": (_LocalFileSystem, "LocalFileSystem"),
+    "HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
+}
+
+if _sys.version_info >= (3, 7):
+    def __getattr__(name):
+        if name in _deprecated:
+            obj, new_name = _deprecated[name]
+            _warnings.warn(_msg.format(name, new_name),
+                           DeprecationWarning, stacklevel=2)
+            return obj
+
+        raise AttributeError(
+            "module 'pyarrow' has no attribute '{0}'".format(name)
+        )
+else:
+    localfs = _localfs
+    FileSystem = _FileSystem
+    LocalFileSystem = _LocalFileSystem
+    HadoopFileSystem = _HadoopFileSystem
+
+
 # Entry point for starting the plasma store
diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index bc4c471515d..0831adbd3e7 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -19,12 +19,19 @@
 import os
 import inspect
 import posixpath
+import sys
 import urllib.parse
+import warnings
 from os.path import join as pjoin
 
 import pyarrow as pa
-from pyarrow.util import implements, _stringify_path, _is_path_like
+from pyarrow.util import implements, _stringify_path, _is_path_like, _DEPR_MSG
+
+
+_FS_DEPR_MSG = _DEPR_MSG.format(
+    "filesystem.LocalFileSystem", "2.0.0", "fs.LocalFileSystem"
+)
 
 
 class FileSystem:
@@ -237,12 +244,23 @@ class LocalFileSystem(FileSystem):
 
     _instance = None
 
+    def __init__(self):
+        warnings.warn(_FS_DEPR_MSG, DeprecationWarning, stacklevel=2)
+        super().__init__()
+
     @classmethod
-    def get_instance(cls):
+    def _get_instance(cls):
         if cls._instance is None:
-            cls._instance = LocalFileSystem()
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                cls._instance = LocalFileSystem()
         return cls._instance
 
+    @classmethod
+    def get_instance(cls):
+        warnings.warn(_FS_DEPR_MSG, DeprecationWarning, stacklevel=2)
+        return cls._get_instance()
+
     @implements(FileSystem.ls)
     def ls(self, path):
         path = _stringify_path(path)
@@ -431,7 +448,15 @@ def _ensure_filesystem(fs):
     # In
case its a simple LocalFileSystem (e.g. dask) use native arrow # FS elif mro.__name__ == 'LocalFileSystem': - return LocalFileSystem.get_instance() + return LocalFileSystem._get_instance() + + if "fsspec" in sys.modules: + fsspec = sys.modules["fsspec"] + if isinstance(fs, fsspec.AbstractFileSystem): + # for recent fsspec versions that stop inheriting from + # pyarrow.filesystem.FileSystem, still allow fsspec + # filesystems (which should be compatible with our legacy fs) + return fs raise OSError('Unrecognized filesystem: {}'.format(fs_type)) else: @@ -476,15 +501,15 @@ def resolve_filesystem_and_path(where, filesystem=None): port = 0 if len(netloc_split) == 2 and netloc_split[1].isnumeric(): port = int(netloc_split[1]) - fs = pa.hdfs.connect(host=host, port=port) + fs = pa.hdfs._connect(host=host, port=port) fs_path = parsed_uri.path elif parsed_uri.scheme == 'file': # Input is local URI such as file:///home/user/myfile.parquet - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() fs_path = parsed_uri.path else: # Input is local path such as /home/user/myfile.parquet - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() fs_path = path return fs, fs_path diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py index f4beec37387..eb1b019bf91 100644 --- a/python/pyarrow/hdfs.py +++ b/python/pyarrow/hdfs.py @@ -19,8 +19,9 @@ import os import posixpath import sys +import warnings -from pyarrow.util import implements +from pyarrow.util import implements, _DEPR_MSG from pyarrow.filesystem import FileSystem import pyarrow.lib as lib @@ -34,6 +35,10 @@ class HadoopFileSystem(lib.HadoopFileSystem, FileSystem): def __init__(self, host="default", port=0, user=None, kerb_ticket=None, driver='libhdfs', extra_conf=None): + warnings.warn( + _DEPR_MSG.format( + "hdfs.HadoopFileSystem", "2.0.0", "fs.HadoopFileSystem"), + DeprecationWarning, stacklevel=2) if driver == 'libhdfs': _maybe_set_hadoop_classpath() @@ -205,7 +210,21 @@ def 
connect(host="default", port=0, user=None, kerb_ticket=None, ------- filesystem : HadoopFileSystem """ - fs = HadoopFileSystem(host=host, port=port, user=user, - kerb_ticket=kerb_ticket, - extra_conf=extra_conf) + warnings.warn( + _DEPR_MSG.format("hdfs.connect", "2.0.0", "fs.HadoopFileSystem"), + DeprecationWarning, stacklevel=2 + ) + return _connect( + host=host, port=port, user=user, kerb_ticket=kerb_ticket, + extra_conf=extra_conf + ) + + +def _connect(host="default", port=0, user=None, kerb_ticket=None, + extra_conf=None): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + fs = HadoopFileSystem(host=host, port=port, user=user, + kerb_ticket=kerb_ticket, + extra_conf=extra_conf) return fs diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index b6dc6a8bd3b..14d77fe0d21 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1348,7 +1348,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1, if _is_path_like(path_or_paths) and fs.isdir(path_or_paths): manifest = ParquetManifest(path_or_paths, filesystem=fs, open_file_func=open_file_func, - pathsep=fs.pathsep, + pathsep=getattr(fs, "pathsep", "/"), metadata_nthreads=metadata_nthreads) common_metadata_path = manifest.common_metadata_path metadata_path = manifest.metadata_path diff --git a/python/pyarrow/tests/test_filesystem.py b/python/pyarrow/tests/test_filesystem.py index 4a6606ff51a..b859a4353ee 100644 --- a/python/pyarrow/tests/test_filesystem.py +++ b/python/pyarrow/tests/test_filesystem.py @@ -15,8 +15,38 @@ # specific language governing permissions and limitations # under the License. 
+import sys + +import pyarrow as pa from pyarrow import filesystem +import pytest + + +def test_filesystem_deprecated(): + with pytest.warns(DeprecationWarning): + filesystem.LocalFileSystem() + + with pytest.warns(DeprecationWarning): + filesystem.LocalFileSystem.get_instance() + + +@pytest.mark.skipif(sys.version_info < (3, 7), + reason="getattr needs Python 3.7") +def test_filesystem_deprecated_toplevel(): + + with pytest.warns(DeprecationWarning): + pa.localfs + + with pytest.warns(DeprecationWarning): + pa.FileSystem + + with pytest.warns(DeprecationWarning): + pa.LocalFileSystem + + with pytest.warns(DeprecationWarning): + pa.HadoopFileSystem + def test_resolve_uri(): uri = "file:///home/user/myfile.parquet" diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index 2110be59069..c048f6557ae 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -46,7 +46,8 @@ def hdfs_test_client(): raise ValueError('Env variable ARROW_HDFS_TEST_PORT was not ' 'an integer') - return pa.hdfs.connect(host, port, user) + with pytest.warns(DeprecationWarning): + return pa.hdfs.connect(host, port, user) @pytest.mark.hdfs diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index b2026f88599..36e0ff67cec 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -499,7 +499,7 @@ def test_multiple_path_types(tempdir, use_legacy_dataset): @parametrize_legacy_dataset @pytest.mark.parametrize("filesystem", [ - None, fs.LocalFileSystem(), LocalFileSystem.get_instance() + None, fs.LocalFileSystem(), LocalFileSystem._get_instance() ]) def test_relative_paths(tempdir, use_legacy_dataset, filesystem): # reading and writing from relative paths @@ -1712,13 +1712,13 @@ def test_partition_set_dictionary_type(): @pytest.mark.pandas @parametrize_legacy_dataset def test_read_partitioned_directory(tempdir, use_legacy_dataset): - fs = 
LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() _partition_test_for_filesystem(fs, tempdir, use_legacy_dataset) @pytest.mark.pandas def test_create_parquet_dataset_multi_threaded(tempdir): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir _partition_test_for_filesystem(fs, base_path) @@ -1738,7 +1738,7 @@ def test_create_parquet_dataset_multi_threaded(tempdir): def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset): # ARROW-3861 - do not include partition columns in resulting table when # `columns` keyword was passed without those columns - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir _partition_test_for_filesystem(fs, base_path) @@ -1757,7 +1757,7 @@ def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset): @pytest.mark.pandas @parametrize_legacy_dataset def test_filters_equivalency(tempdir, use_legacy_dataset): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir integer_keys = [0, 1] @@ -1845,7 +1845,7 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): @pytest.mark.pandas @parametrize_legacy_dataset def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir integer_keys = [0, 1, 2, 3, 4] @@ -1887,7 +1887,7 @@ def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): reason='Loss of type information in creation of categoricals.' 
) def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir date_keys = [ @@ -1932,7 +1932,7 @@ def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): @pytest.mark.pandas @parametrize_legacy_dataset def test_filters_inclusive_integer(tempdir, use_legacy_dataset): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir integer_keys = [0, 1, 2, 3, 4] @@ -1968,7 +1968,7 @@ def test_filters_inclusive_integer(tempdir, use_legacy_dataset): @pytest.mark.pandas @parametrize_legacy_dataset def test_filters_inclusive_set(tempdir, use_legacy_dataset): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir integer_keys = [0, 1] @@ -2006,7 +2006,7 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): @pytest.mark.pandas @parametrize_legacy_dataset def test_filters_invalid_pred_op(tempdir, use_legacy_dataset): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir integer_keys = [0, 1, 2, 3, 4] @@ -2054,7 +2054,7 @@ def test_filters_invalid_pred_op(tempdir, use_legacy_dataset): def test_filters_invalid_column(tempdir, use_legacy_dataset): # ARROW-5572 - raise error on invalid name in filter specification # works with new dataset / xfail with legacy implementation - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir integer_keys = [0, 1, 2, 3, 4] @@ -2079,7 +2079,7 @@ def test_filters_invalid_column(tempdir, use_legacy_dataset): @parametrize_legacy_dataset def test_filters_read_table(tempdir, use_legacy_dataset): # test that filters keyword is passed through in read_table - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir integer_keys = [0, 1, 2, 3, 4] @@ -2116,7 +2116,7 @@ def test_filters_read_table(tempdir, use_legacy_dataset): 
def test_partition_keys_with_underscores(tempdir, use_legacy_dataset): # ARROW-5666 - partition field values with underscores preserve underscores # xfail with legacy dataset -> they get interpreted as integers - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() base_path = tempdir string_keys = ["2019_2", "2019_3"] @@ -2311,13 +2311,13 @@ def _test_read_common_metadata_files(fs, base_path): @pytest.mark.pandas def test_read_common_metadata_files(tempdir): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() _test_read_common_metadata_files(fs, tempdir) @pytest.mark.pandas def test_read_metadata_files(tempdir): - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() N = 100 df = pd.DataFrame({ @@ -2426,7 +2426,7 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): result2 = read_multiple_files(paths, metadata=metadata) assert result2.equals(expected) - result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema) + result3 = pq.ParquetDataset(dirpath, schema=metadata.schema).read() assert result3.equals(expected) else: with pytest.raises(ValueError, match="no longer supported"): @@ -2436,14 +2436,18 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): to_read = [0, 2, 6, result.num_columns - 1] col_names = [result.field(i).name for i in to_read] - out = pa.localfs.read_parquet(dirpath, columns=col_names) + out = pq.read_table( + dirpath, columns=col_names, use_legacy_dataset=use_legacy_dataset + ) expected = pa.Table.from_arrays([result.column(i) for i in to_read], names=col_names, metadata=result.schema.metadata) assert out.equals(expected) # Read with multiple threads - pa.localfs.read_parquet(dirpath, use_threads=True) + pq.read_table( + dirpath, use_threads=True, use_legacy_dataset=use_legacy_dataset + ) # Test failure modes with non-uniform metadata bad_apple = _test_dataframe(size, seed=i).iloc[:, :4] @@ -2892,7 +2896,7 @@ def 
_test_write_to_dataset_no_partitions(base_path, output_table = pa.Table.from_pandas(output_df) if filesystem is None: - filesystem = LocalFileSystem.get_instance() + filesystem = LocalFileSystem._get_instance() # Without partitions, append files to root_path n = 5 @@ -3315,7 +3319,7 @@ def test_backwards_compatible_column_metadata_handling( # TODO(dataset) support pickling def _make_dataset_for_pickling(tempdir, N=100): path = tempdir / 'data.parquet' - fs = LocalFileSystem.get_instance() + fs = LocalFileSystem._get_instance() df = pd.DataFrame({ 'index': np.arange(N), @@ -3552,7 +3556,7 @@ def test_parquet_file_pass_directory_instead_of_file(tempdir): @pytest.mark.pandas @pytest.mark.parametrize("filesystem", [ None, - LocalFileSystem.get_instance(), + LocalFileSystem._get_instance(), fs.LocalFileSystem(), ]) def test_parquet_writer_filesystem_local(tempdir, filesystem): diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index 690ba3f1fc1..e91294a3a1b 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -24,6 +24,11 @@ import warnings +_DEPR_MSG = ( + "pyarrow.{} is deprecated as of {}, please use pyarrow.{} instead." +) + + def implements(f): def decorator(g): g.__doc__ = f.__doc__ @@ -32,8 +37,7 @@ def decorator(g): def _deprecate_api(old_name, new_name, api, next_version): - msg = ('pyarrow.{} is deprecated as of {}, please use pyarrow.{} instead' - .format(old_name, next_version, new_name)) + msg = _DEPR_MSG.format(old_name, next_version, new_name) def wrapper(*args, **kwargs): warnings.warn(msg, FutureWarning) @@ -46,13 +50,12 @@ def _deprecate_class(old_name, new_class, next_version, """ Raise warning if a deprecated class is used in an isinstance check. 
""" - msg = 'pyarrow.{} is deprecated as of {}, please use pyarrow.{} instead' - class _DeprecatedMeta(type): def __instancecheck__(self, other): warnings.warn( - msg.format(old_name, next_version, new_class.__name__), - FutureWarning + _DEPR_MSG.format(old_name, next_version, new_class.__name__), + FutureWarning, + stacklevel=2 ) return isinstance(other, new_class)