From 2e854426415719095ae48657beda70042345e009 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Wed, 20 Feb 2019 16:27:55 +0100
Subject: [PATCH] ARROW-4559: [Python] Allow Parquet files with special characters in their names

---
 python/pyarrow/filesystem.py            | 14 ++++++++--
 python/pyarrow/tests/test_filesystem.py | 37 +++++++++++++++++++++++++
 python/pyarrow/tests/test_parquet.py    | 11 ++++++++
 3 files changed, 59 insertions(+), 3 deletions(-)
 create mode 100644 python/pyarrow/tests/test_filesystem.py

diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index 64148d34053..98fb7737047 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -399,7 +399,8 @@ def _ensure_filesystem(fs):
 
 def resolve_filesystem_and_path(where, filesystem=None):
     """
-    return filesystem from path which could be an HDFS URI
+    Return filesystem from path which could be an HDFS URI, a local URI,
+    or a plain filesystem path.
     """
     if not _is_path_like(where):
         if filesystem is not None:
@@ -407,7 +408,6 @@ def resolve_filesystem_and_path(where, filesystem=None):
                              " there is nothing to open with filesystem.")
         return filesystem, where
 
-    # input can be hdfs URI such as hdfs://host:port/myfile.parquet
     path = _stringify_path(where)
 
     if filesystem is not None:
@@ -415,6 +415,7 @@ def resolve_filesystem_and_path(where, filesystem=None):
 
     parsed_uri = urlparse(path)
     if parsed_uri.scheme == 'hdfs':
+        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
         netloc_split = parsed_uri.netloc.split(':')
         host = netloc_split[0]
         if host == '':
@@ -423,7 +424,14 @@ def resolve_filesystem_and_path(where, filesystem=None):
         if len(netloc_split) == 2 and netloc_split[1].isnumeric():
             port = int(netloc_split[1])
         fs = pa.hdfs.connect(host=host, port=port)
+        fs_path = parsed_uri.path
+    elif parsed_uri.scheme == 'file':
+        # Input is local URI such as file:///home/user/myfile.parquet
+        fs = LocalFileSystem.get_instance()
+        fs_path = parsed_uri.path
     else:
+        # Input is local path such as /home/user/myfile.parquet
         fs = LocalFileSystem.get_instance()
+        fs_path = where
 
-    return fs, parsed_uri.path
+    return fs, fs_path
diff --git a/python/pyarrow/tests/test_filesystem.py b/python/pyarrow/tests/test_filesystem.py
new file mode 100644
index 00000000000..4a6606ff51a
--- /dev/null
+++ b/python/pyarrow/tests/test_filesystem.py
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow import filesystem
+
+
+def test_resolve_uri():
+    uri = "file:///home/user/myfile.parquet"
+    fs, path = filesystem.resolve_filesystem_and_path(uri)
+    assert isinstance(fs, filesystem.LocalFileSystem)
+    assert path == "/home/user/myfile.parquet"
+
+
+def test_resolve_local_path():
+    for uri in ['/home/user/myfile.parquet',
+                'myfile.parquet',
+                'my # file ? parquet',
+                'C:/Windows/myfile.parquet',
+                r'C:\\Windows\\myfile.parquet',
+                ]:
+        fs, path = filesystem.resolve_filesystem_and_path(uri)
+        assert isinstance(fs, filesystem.LocalFileSystem)
+        assert path == uri
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 5156300b01b..0b93032b495 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -199,6 +199,17 @@ def test_no_memory_map(tempdir):
     assert table_read.equals(table)
 
 
+def test_special_chars_filename(tempdir):
+    table = pa.Table.from_arrays([pa.array([42])], ["ints"])
+    filename = "foo # bar"
+    path = tempdir / filename
+    assert not path.exists()
+    _write_table(table, str(path))
+    assert path.exists()
+    table_read = _read_table(str(path))
+    assert table_read.equals(table)
+
+
 def test_empty_table_roundtrip():
     df = alltypes_sample(size=10)
     # The nanosecond->us conversion is a nuisance, so we just avoid it here