apache · rdblue · Jan 24, 2022 · Dec 13, 2021 · Jan 4, 2022 · Jan 4, 2022
diff --git a/python/src/iceberg/io/base.py b/python/src/iceberg/io/base.py
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from abc import ABC, abstractmethod
+
+
+class InputFile(ABC):
+    """Abstract handle for reading the file at a single location.
+
+    The base class only stores the location passed to the constructor and
+    exposes it read-only. The length, the existence check and the
+    context-manager hooks are abstract and must be supplied by each
+    implementation.
+    """
+
+    def __init__(self, location: str):
+        self._location = location
+
+    @property
+    def location(self) -> str:
+        """The fully-qualified location of the input file, as given at construction"""
+        return self._location
+
+    @property
+    @abstractmethod
+    def exists(self) -> bool:
+        """Whether the file exists (declared as a property, so read it without calling)"""
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Total length of the file in bytes, so that `len(input_file)` works"""
+
+    # NOTE(review): OutputFile.__enter__ is documented to return the file-like
+    # object itself rather than `self`; confirm which convention is intended.
+    @abstractmethod
+    def __enter__(self):
+        """Enter context for InputFile
+
+        Implementations should assign a seekable stream to `self.input_stream`
+        and return `self`. If the file does not exist, a FileNotFoundError
+        should be raised.
+        """
+
+    @abstractmethod
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        """Exit context for InputFile
+
+        Implementations should perform any necessary teardown here. The three
+        arguments are the usual context-manager exception details.
+        """
+
+
+class OutputFile(ABC):
+    """A base class for OutputFile implementations
+
+    The base class stores the target location and the overwrite flag passed to
+    the constructor and exposes both read-only. The length, the existence
+    check, the conversion to an InputFile and the context-manager hooks are
+    abstract and must be supplied by each implementation.
+    """
+
+    def __init__(self, location: str, overwrite: bool = False):
+        self._location = location
+        self._overwrite = overwrite
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Returns the total length of the file, in bytes"""
+
+    @property
+    def location(self) -> str:
+        """The fully-qualified location of the output file"""
+        return self._location
+
+    @property
+    def overwrite(self) -> bool:
+        """Whether or not to overwrite the file if it exists"""
+        return self._overwrite
+
+    @property
+    @abstractmethod
+    def exists(self) -> bool:
+        """Checks whether the file exists"""
+
+    @abstractmethod
+    def to_input_file(self) -> InputFile:
+        """Returns an InputFile for the location of this output file"""
+
+    @abstractmethod
+    def __enter__(self):
+        """Enter context for OutputFile
+
+        This method should return a file-like object. If the file already exists
+        at `self.location` and `self.overwrite` is False a FileExistsError should
+        be raised.
+
+        Example, where `MyOutputFile` stands for any concrete subclass::
+
+            with MyOutputFile(location="foo/bar.json", overwrite=True) as f:
+                f.write(b"foo")
+        """
+
+    @abstractmethod
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        """Exit context for OutputFile
+
+        This method should perform any necessary teardown, for example closing
+        a stream or releasing a connection that `__enter__` opened.
+
+        Example, where `MyOutputFile` stands for any concrete subclass::
+
+            with MyOutputFile(location="foo/bar.json") as f:
+                f.write(b"foo")
+            # `__exit__` has run by this point, so whatever `__enter__`
+            # acquired should have been released.
+        """
+
+
+class FileIO(ABC):
+    """Abstract factory for input and output file handles, plus file deletion"""
+
+    @abstractmethod
+    def new_input(self, location: str) -> InputFile:
+        """Return an InputFile for reading bytes from the file at `location`"""
+
+    @abstractmethod
+    def new_output(self, location: str) -> OutputFile:
+        """Return an OutputFile for writing bytes to the file at `location`"""
+
+    @abstractmethod
+    def delete(self, location: str):
+        """Delete the file at `location`"""
diff --git a/python/tests/io/test_base.py b/python/tests/io/test_base.py
@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import io
+
+from iceberg.io.base import FileIO, InputFile, OutputFile
+
+
+class FooInputFile(InputFile):
+    """Minimal in-memory InputFile used to exercise the abstract base class.
+
+    Every instance serves the fixed payload ``b"foo"``, whatever its location.
+    """
+
+    # The bytes every instance pretends to contain.
+    _CONTENT = b"foo"
+
+    def __len__(self):
+        """Return the size of the fixed payload in bytes."""
+        return len(self._CONTENT)
+
+    @property
+    def exists(self):
+        """Always True: the fake file is considered present."""
+        # Declared as a property to match InputFile.exists. As a plain method,
+        # `input_file.exists` would evaluate to a bound method, which is
+        # truthy no matter what the method would return.
+        return True
+
+    def __enter__(self):
+        """Return a fresh readable stream positioned at the start of the payload."""
+        super().__enter__()
+        return io.BytesIO(self._CONTENT)
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        """Nothing to tear down; returns None so exceptions are not suppressed."""
+        super().__exit__(exc_type, exc_value, exc_traceback)
+
+
+class FooOutputFile(OutputFile):
+    """Minimal in-memory OutputFile used to exercise the abstract base class.
+
+    Bytes written inside the ``with`` block land in ``self._mock_storage`` so
+    that tests can inspect them afterwards.
+    """
+
+    def __init__(self, location: str, overwrite: bool = False):
+        super().__init__(location=location, overwrite=overwrite)
+        # Created eagerly so that len() works before the context is entered.
+        self._mock_storage = io.BytesIO()
+
+    def __call__(self, overwrite: bool = False, **kwargs):
+        """Record the requested overwrite flag and return self.
+
+        Extra keyword arguments are accepted and ignored.
+        """
+        # OutputFile defines no __call__, so delegating to super() here would
+        # raise AttributeError; store the flag directly instead.
+        self._overwrite = overwrite
+        return self
+
+    def __len__(self):
+        """Return the number of bytes currently held by the in-memory buffer."""
+        return len(self._mock_storage.getvalue())
+
+    @property
+    def exists(self):
+        """Always True: the fake file is considered present."""
+        # Declared as a property to match OutputFile.exists. As a plain method,
+        # `output_file.exists` would evaluate to a bound method, which is
+        # truthy no matter what the method would return.
+        return True
+
+    def to_input_file(self):
+        """Return a FooInputFile pointing at the same location."""
+        return FooInputFile(location=self.location)
+
+    def __enter__(self):
+        """Replace the buffer with an empty one and return it for writing."""
+        self._mock_storage = io.BytesIO()
+        return self._mock_storage
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        """Leave the buffer open for inspection; returns None."""
+        super().__exit__(exc_type, exc_value, exc_traceback)
+
+
+class FooFileIO(FileIO):
+    """FileIO test double that hands out the in-memory Foo* file objects."""
+
+    def new_input(self, location: str):
+        """Build a FooInputFile for `location`."""
+        input_file = FooInputFile(location=location)
+        return input_file
+
+    def new_output(self, location: str):
+        """Build a FooOutputFile for `location`."""
+        output_file = FooOutputFile(location=location)
+        return output_file
+
+    def delete(self, location: str):
+        """Do nothing; there is no backing storage to remove from."""
+        return None
+
+
+def test_custom_input_file():
+    """A concrete InputFile keeps its location and yields a readable stream."""
+    expected_location = "foo/bar.json"
+    source = FooInputFile(location=expected_location)
+    assert source.location == expected_location
+
+    with source as stream:
+        payload = stream.read()
+
+    assert payload == b"foo"
+
+
+def test_custom_output_file():
+    """Bytes written through a concrete OutputFile end up in its mock storage."""
+    expected_location = "foo/bar.json"
+    sink = FooOutputFile(location=expected_location)
+    assert sink.location == expected_location
+
+    with sink as stream:
+        stream.write(b"foo")
+
+    # Rewind before reading back what the context body wrote.
+    stored = sink._mock_storage
+    stored.seek(0)
+    assert stored.read() == b"foo"
+
+
+def test_custom_output_file_with_overwrite():
+    """The overwrite flag given at construction is exposed and writes still work."""
+
+    output_file = FooOutputFile(location="foo/bar.json", overwrite=True)
+    assert output_file.location == "foo/bar.json"
+    # Identity check instead of `== True`: the property must hand back the
+    # actual boolean that was passed in, not merely something equal to it.
+    assert output_file.overwrite is True
+
+    with output_file as f:
+        f.write(b"foo")
+
+    # Rewind before reading back what the context body wrote.
+    output_file._mock_storage.seek(0)
+    assert output_file._mock_storage.read() == b"foo"
+
+
+def test_custom_file_io():
+    """FooFileIO hands back the matching Foo* input and output file types."""
+    factory = FooFileIO()
+
+    reader = factory.new_input(location="foo")
+    assert isinstance(reader, FooInputFile)
+
+    writer = factory.new_output(location="bar")
+    assert isinstance(writer, FooOutputFile)