Skip to content

Commit 8bf51f5

Browse files
committed
Add more filesystem methods, tests for HDFS
Change-Id: I6ca7a89c0cd9219657c4964345ff7664b8c3dd78
1 parent 98847b5 commit 8bf51f5

File tree

10 files changed

+266
-39
lines changed

cpp/src/arrow/io/hdfs.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ class HadoopFileSystem::HadoopFileSystemImpl {
495495
}
496496

497497
Status Chmod(const std::string& path, int mode) {
498-
int ret = driver_->Chmod(fs_, path.c_str(), static_cast<short>(mode));
498+
int ret = driver_->Chmod(fs_, path.c_str(), static_cast<short>(mode)); // NOLINT
499499
CHECK_FAILURE(ret, "Chmod");
500500
return Status::OK();
501501
}

python/doc/source/api.rst

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,14 +165,16 @@ Input / Output and Shared Memory
165165
PythonFile
166166

167167
File Systems
168-
-----------
168+
------------
169169

170170
.. autosummary::
171171
:toctree: generated/
172172

173173
hdfs.connect
174-
HadoopFileSystem
175-
LocalFilesystem
174+
LocalFileSystem
175+
176+
.. class:: HadoopFileSystem
177+
:noindex:
176178

177179
.. _api.ipc:
178180

python/doc/source/conf.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@
6262
]
6363

6464
# Show members for classes in .. autosummary
65-
autodoc_default_flags = ['members', 'undoc-members', 'show-inheritance', 'inherited-members']
65+
autodoc_default_flags = ['members', 'undoc-members', 'show-inheritance',
66+
'inherited-members']
6667

6768
# numpydoc configuration
6869
napoleon_use_rtype = False

python/doc/source/filesystems.rst

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
.. specific language governing permissions and limitations
1616
.. under the License.
1717
18-
Filesystem Interfaces
19-
=====================
18+
File System Interfaces
19+
======================
2020

2121
In this section, we discuss filesystem-like interfaces in PyArrow.
2222

@@ -34,7 +34,7 @@ System. You connect like so:
3434
hdfs = pa.hdfs.connect(host, port, user=user, kerb_ticket=ticket_cache_path)
3535
type(hdfs)
3636
37-
By default, ``pyarrow.hdfs.HadoopFilesystem`` uses libhdfs, a JNI-based
37+
By default, ``pyarrow.hdfs.HadoopFileSystem`` uses libhdfs, a JNI-based
3838
interface to the Java Hadoop client. This library is loaded **at runtime**
3939
(rather than at link / library load time, since the library may not be in your
4040
LD_LIBRARY_PATH), and relies on some environment variables.
@@ -69,16 +69,20 @@ HDFS API
6969
:toctree: generated/
7070

7171
hdfs.connect
72-
HadoopFilesystem
73-
HadoopFilesystem.cat
74-
HadoopFilesystem.chmod
75-
HadoopFilesystem.chown
76-
HadoopFilesystem.delete
77-
HadoopFilesystem.download
78-
HadoopFilesystem.exists
79-
HadoopFilesystem.info
80-
HadoopFilesystem.ls
81-
HadoopFilesystem.mkdir
82-
HadoopFilesystem.rm
83-
HadoopFilesystem.upload
72+
HadoopFileSystem.cat
73+
HadoopFileSystem.chmod
74+
HadoopFileSystem.chown
75+
HadoopFileSystem.delete
76+
HadoopFileSystem.df
77+
HadoopFileSystem.disk_usage
78+
HadoopFileSystem.download
79+
HadoopFileSystem.exists
80+
HadoopFileSystem.get_capacity
81+
HadoopFileSystem.get_space_used
82+
HadoopFileSystem.info
83+
HadoopFileSystem.ls
84+
HadoopFileSystem.mkdir
85+
HadoopFileSystem.rename
86+
HadoopFileSystem.rm
87+
HadoopFileSystem.upload
8488
HdfsFile

python/doc/source/memory.rst

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -226,10 +226,3 @@ file interfaces that can read and write to Arrow Buffers.
226226
reader.read(7)
227227
228228
These have similar semantics to Python's built-in ``io.BytesIO``.
229-
230-
Hadoop Filesystem
231-
-----------------
232-
233-
:class:`~pyarrow.HdfsFile` is an implementation of :class:`~pyarrow.NativeFile`
234-
that can read and write to the Hadoop filesytem. Read more in the
235-
:ref:`Filesystems Section <hdfs>`.

python/pyarrow/filesystem.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,17 @@ class FileSystem(object):
2626
"""
2727
Abstract filesystem interface
2828
"""
29+
def cat(self, path):
30+
"""
31+
Return contents of file as a bytes object
32+
33+
Returns
34+
-------
35+
contents : bytes
36+
"""
37+
with self.open(path, 'rb') as f:
38+
return f.read()
39+
2940
def ls(self, path):
3041
"""
3142
Return list of file paths
@@ -44,12 +55,68 @@ def delete(self, path, recursive=False):
4455
"""
4556
raise NotImplementedError
4657

58+
def disk_usage(self, path):
59+
"""
60+
Compute bytes used by all contents under indicated path in file tree
61+
62+
Parameters
63+
----------
64+
path : string
65+
Can be a file path or directory
66+
67+
Returns
68+
-------
69+
usage : int
70+
"""
71+
path_info = self.stat(path)
72+
if path_info['kind'] == 'file':
73+
return path_info['size']
74+
75+
total = 0
76+
for root, directories, files in self.walk(path):
77+
for child_path in files:
78+
abspath = self._path_join(root, child_path)
79+
total += self.stat(abspath)['size']
80+
81+
return total
82+
83+
def _path_join(self, *args):
84+
return self.pathsep.join(args)
85+
86+
def stat(self, path):
87+
"""
88+
89+
Returns
90+
-------
91+
stat : dict
92+
"""
93+
raise NotImplementedError('FileSystem.stat')
94+
4795
def rm(self, path, recursive=False):
4896
"""
4997
Alias for FileSystem.delete
5098
"""
5199
return self.delete(path, recursive=recursive)
52100

101+
def mv(self, path, new_path):
102+
"""
103+
Alias for FileSystem.rename
104+
"""
105+
return self.rename(path, new_path)
106+
107+
def rename(self, path, new_path):
108+
"""
109+
Rename file, like UNIX mv command
110+
111+
Parameters
112+
----------
113+
path : string
114+
Path to alter
115+
new_path : string
116+
Path to move to
117+
"""
118+
raise NotImplementedError('FileSystem.rename')
119+
53120
def mkdir(self, path, create_parents=True):
54121
raise NotImplementedError
55122

python/pyarrow/hdfs.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ def delete(self, path, recursive=False):
4848
def mkdir(self, path, create_parents=True):
4949
return super(HadoopFileSystem, self).mkdir(path)
5050

51+
@implements(FileSystem.rename)
52+
def rename(self, path, new_path):
53+
return super(HadoopFileSystem, self).rename(path, new_path)
54+
5155
def ls(self, path, detail=False):
5256
"""
5357
Retrieve directory contents and metadata, if requested.
@@ -82,7 +86,7 @@ def walk(self, top_path):
8286
directories, files = _libhdfs_walk_files_dirs(top_path, contents)
8387
yield top_path, directories, files
8488
for dirname in directories:
85-
for tup in self.walk(dirname):
89+
for tup in self.walk(self._path_join(top_path, dirname)):
8690
yield tup
8791

8892

python/pyarrow/includes/libarrow.pxd

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,10 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil:
413413
ObjectType_FILE" arrow::io::ObjectType::FILE"
414414
ObjectType_DIRECTORY" arrow::io::ObjectType::DIRECTORY"
415415

416+
cdef cppclass FileStatistics:
417+
int64_t size
418+
ObjectType kind
419+
416420
cdef cppclass FileInterface:
417421
CStatus Close()
418422
CStatus Tell(int64_t* position)
@@ -450,6 +454,9 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil:
450454
WriteableFile):
451455
pass
452456

457+
cdef cppclass FileSystem:
458+
CStatus Stat(const c_string& path, FileStatistics* stat)
459+
453460

454461
cdef extern from "arrow/io/file.h" namespace "arrow::io" nogil:
455462

@@ -517,7 +524,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil:
517524
cdef cppclass HdfsOutputStream(OutputStream):
518525
pass
519526

520-
cdef cppclass CHadoopFileSystem" arrow::io::HadoopFileSystem":
527+
cdef cppclass CHadoopFileSystem" arrow::io::HadoopFileSystem"(FileSystem):
521528
@staticmethod
522529
CStatus Connect(const HdfsConnectionConfig* config,
523530
shared_ptr[CHadoopFileSystem]* client)

python/pyarrow/io-hdfs.pxi

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,48 @@ cdef class HadoopFileSystem:
129129
self._path_info(path, &info)
130130
return info.kind == ObjectType_FILE
131131

132+
def get_capacity(self):
133+
"""
134+
Get reported total capacity of file system
135+
136+
Returns
137+
-------
138+
capacity : int
139+
"""
140+
cdef int64_t capacity = 0
141+
with nogil:
142+
check_status(self.client.get().GetCapacity(&capacity))
143+
return capacity
144+
145+
def get_space_used(self):
146+
"""
147+
Get space used on file system
148+
149+
Returns
150+
-------
151+
space_used : int
152+
"""
153+
cdef int64_t space_used = 0
154+
with nogil:
155+
check_status(self.client.get().GetUsed(&space_used))
156+
return space_used
157+
158+
def df(self):
159+
"""
160+
Return free space on disk, like the UNIX df command
161+
162+
Returns
163+
-------
164+
space : int
165+
"""
166+
return self.get_capacity() - self.get_space_used()
167+
168+
def rename(self, path, new_path):
169+
cdef c_string c_path = tobytes(path)
170+
cdef c_string c_new_path = tobytes(new_path)
171+
with nogil:
172+
check_status(self.client.get().Rename(c_path, c_new_path))
173+
132174
def info(self, path):
133175
"""
134176
Return detailed HDFS information for path
@@ -158,6 +200,30 @@ cdef class HadoopFileSystem:
158200
else 'file')
159201
}
160202

203+
def stat(self, path):
204+
"""
205+
Return basic file system statistics about path
206+
207+
Parameters
208+
----------
209+
path : string
210+
Path to file or directory
211+
212+
Returns
213+
-------
214+
stat : dict
215+
"""
216+
cdef FileStatistics info
217+
cdef c_string c_path = tobytes(path)
218+
with nogil:
219+
check_status(self.client.get()
220+
.Stat(c_path, &info))
221+
return {
222+
'size': info.size,
223+
'kind': ('directory' if info.kind == ObjectType_DIRECTORY
224+
else 'file')
225+
}
226+
161227
cdef _path_info(self, path, HdfsPathInfo* info):
162228
cdef c_string c_path = tobytes(path)
163229

@@ -290,9 +356,16 @@ cdef class HadoopFileSystem:
290356
def open(self, path, mode='rb', buffer_size=None, replication=None,
291357
default_block_size=None):
292358
"""
359+
Open HDFS file for reading or writing
360+
293361
Parameters
294362
----------
295-
mode : string, 'rb', 'wb', 'ab'
363+
mode : string
364+
Must be one of 'rb', 'wb', 'ab'
365+
366+
Returns
367+
-------
368+
handle : HdfsFile
296369
"""
297370
self._ensure_client()
298371

0 commit comments

Comments (0)