databricks · sueann · Dec 19, 2017 · Nov 30, 2017 · Dec 5, 2017 · Dec 5, 2017
diff --git a/README.md b/README.md
@@ -80,14 +80,23 @@ To try running the examples below, check out the Databricks notebook [DeepLearni
 
 ### Working with images in Spark
 
-The first step to applying deep learning on images is the ability to load the images. Deep Learning Pipelines includes utility functions that can load millions of images into a Spark DataFrame and decode them automatically in a distributed fashion, allowing manipulation at scale.
+The first step to applying deep learning on images is the ability to load the images. Spark and Deep Learning Pipelines include utility functions that can load millions of images into a Spark DataFrame and decode them automatically in a distributed fashion, allowing manipulation at scale.
+
+Using Spark's ImageSchema:
+
+```python
+from pyspark.ml.image import ImageSchema
+image_df = ImageSchema.readImages("/data/myimages")
+```
+
+or, if a custom image library is needed:
 
 ```python
-from sparkdl import readImages
-image_df = readImages("/data/myimages")
+from sparkdl.image import imageIO as imageIO
+image_df = imageIO.readImagesWithCustomFn("/data/myimages",decode_f=<your image library, see imageIO.PIL_decode>)
 ```
 
-The resulting DataFrame contains a string column named "filePath" containing the path to each image file, and a image struct ("`SpImage`") column named "image" containing the decoded image data.
+The resulting DataFrame contains a column named "image" containing an image struct that follows the schema defined by `ImageSchema`.
 
 ```python
 image_df.show()
@@ -109,7 +118,7 @@ featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelNa
 lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
 p = Pipeline(stages=[featurizer, lr])
 
-model = p.fit(train_images_df)    # train_images_df is a dataset of images (SpImage) and labels
+model = p.fit(train_images_df)    # train_images_df is a dataset of images and labels
 
 # Inspect training error
 df = model.transform(train_images_df.limit(10)).select("image", "probability",  "uri", "label")
@@ -127,11 +136,13 @@ Spark DataFrames are a natural construct for applying deep learning models to a
     There are many well-known deep learning models for images. If the task at hand is very similar to what the models provide (e.g. object recognition with ImageNet classes), or for pure exploration, one can use the Transformer `DeepImagePredictor` by simply specifying the model name.
 
     ```python
-    from sparkdl import readImages, DeepImagePredictor
+    from pyspark.ml.image import ImageSchema
+
+    from sparkdl import DeepImagePredictor
 
     predictor = DeepImagePredictor(inputCol="image", outputCol="predicted_labels",
                                    modelName="InceptionV3", decodePredictions=True, topK=10)
-    image_df = readImages("/data/myimages")
+    image_df = ImageSchema.readImages("/data/myimages")
     predictions_df = predictor.transform(image_df)
     ```
 
@@ -140,7 +151,8 @@ Spark DataFrames are a natural construct for applying deep learning models to a
     Deep Learning Pipelines provides a Transformer that will apply the given TensorFlow Graph to a DataFrame containing a column of images (e.g. loaded using the utilities described in the previous section). Here is a very simple example of how a TensorFlow Graph can be used with the Transformer. In practice, the TensorFlow Graph will likely be restored from files before calling `TFImageTransformer`.
 
     ```python
-    from sparkdl import readImages, TFImageTransformer
+    from pyspark.ml.image import ImageSchema
+    from sparkdl import TFImageTransformer
     import sparkdl.graph.utils as tfx
     from sparkdl.transformers import utils
     import tensorflow as tf
@@ -155,7 +167,7 @@ Spark DataFrames are a natural construct for applying deep learning models to a
     transformer = TFImageTransformer(inputCol="image", outputCol="predictions", graph=frozen_graph,
                                      inputTensor=image_arr, outputTensor=resized_images,
                                      outputMode="image")
-    image_df = readImages("/data/myimages")
+    image_df = ImageSchema.readImages("/data/myimages")
     processed_image_df = transformer.transform(image_df)
     ```
 

diff --git a/build.sbt b/build.sbt
@@ -35,7 +35,7 @@ sparkComponents ++= Seq("mllib-local", "mllib", "sql")
 // add any Spark Package dependencies using spDependencies.
 // e.g. spDependencies += "databricks/spark-avro:0.1"
 spDependencies += s"databricks/tensorframes:0.2.9-s_${scalaMajorVersion}"
-spDependencies += "Microsoft/spark-images:0.1"
+
 
 // These versions are ancient, but they cross-compile around scala 2.10 and 2.11.
 // Update them when dropping support for scala 2.10

diff --git a/project/plugins.sbt b/project/plugins.sbt
@@ -1,8 +1,5 @@
 // You may use this file to add plugin dependencies for sbt.
 resolvers += "Spark Packages repo" at "https://dl.bintray.com/spark-packages/maven/"
-
 addSbtPlugin("org.spark-packages" %% "sbt-spark-package" % "0.2.5")
-
 // scalacOptions in (Compile,doc) := Seq("-groups", "-implicits")
-
 addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.0")
diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py
@@ -0,0 +1,221 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# NOTE: This file is copied from Spark 2.3 so that it can be used with already-released Spark versions.
+# TODO: remove this when Spark 2.3 is out!
+
+"""
+.. attribute:: ImageSchema
+
+    An attribute of this module that contains the instance of :class:`_ImageSchema`.
+
+.. autoclass:: _ImageSchema
+   :members:
+"""
+
+import numpy as np
+from pyspark import SparkContext
+from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string
+from pyspark.sql import DataFrame, SparkSession
+
+
+class _ImageSchema(object):
+    """
+    Internal class for `pyspark.ml.image.ImageSchema` attribute. Meant to be private and
+    not to be instantiated. Use `pyspark.ml.image.ImageSchema` attribute to access the
+    APIs of this class.
+    """
+
+    def __init__(self):
+        self._imageSchema = None
+        self._ocvTypes = None
+        self._imageFields = None
+        self._undefinedImageType = None
+
+    @property
+    def imageSchema(self):
+        """
+        Returns the image schema.
+
+        :return: a :class:`StructType` with a single column of images
+               named "image" (nullable).
+
+        .. versionadded:: 2.3.0
+        """
+
+        if self._imageSchema is None:
+            ctx = SparkContext._active_spark_context
+            jschema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageSchema()
+            self._imageSchema = _parse_datatype_json_string(jschema.json())
+        return self._imageSchema
+
+    @property
+    def ocvTypes(self):
+        """
+        Returns the OpenCV type mapping supported.
+
+        :return: a dictionary containing the OpenCV type mapping supported.
+
+        .. versionadded:: 2.3.0
+        """
+
+        if self._ocvTypes is None:
+            ctx = SparkContext._active_spark_context
+            self._ocvTypes = dict(ctx._jvm.org.apache.spark.ml.image.ImageSchema.javaOcvTypes())
+        return self._ocvTypes
+
+    @property
+    def imageFields(self):
+        """
+        Returns field names of image columns.
+
+        :return: a list of field names.
+
+        .. versionadded:: 2.3.0
+        """
+
+        if self._imageFields is None:
+            ctx = SparkContext._active_spark_context
+            self._imageFields = list(ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageFields())
+        return self._imageFields
+
+    @property
+    def undefinedImageType(self):
+        """
+        Returns the name of undefined image type for the invalid image.
+
+        .. versionadded:: 2.3.0
+        """
+
+        if self._undefinedImageType is None:
+            ctx = SparkContext._active_spark_context
+            self._undefinedImageType = \
+                ctx._jvm.org.apache.spark.ml.image.ImageSchema.undefinedImageType()
+        return self._undefinedImageType
+
+    def toNDArray(self, image):
+        """
+        Converts an image to an array with metadata.
+
+        :param `Row` image: A row that contains the image to be converted. It should
+            have the attributes specified in `ImageSchema.imageSchema`.
+        :return: a `numpy.ndarray` that is an image.
+
+        .. versionadded:: 2.3.0
+        """
+
+        if not isinstance(image, Row):
+            raise TypeError(
+                "image argument should be pyspark.sql.types.Row; however, "
+                "it got [%s]." % type(image))
+
+        if any(not hasattr(image, f) for f in self.imageFields):
+            raise ValueError(
+                "image argument should have attributes specified in "
+                "ImageSchema.imageSchema [%s]." % ", ".join(self.imageFields))
+
+        height = image.height
+        width = image.width
+        nChannels = image.nChannels
+        return np.ndarray(
+            shape=(height, width, nChannels),
+            dtype=np.uint8,
+            buffer=image.data,
+            strides=(width * nChannels, nChannels, 1))
+
+    def toImage(self, array, origin=""):
+        """
+        Converts an array with metadata to a two-dimensional image.
+
+        :param `numpy.ndarray` array: The array to convert to image.
+        :param str origin: Path to the image, optional.
+        :return: a :class:`Row` that is a two dimensional image.
+
+        .. versionadded:: 2.3.0
+        """
+
+        if not isinstance(array, np.ndarray):
+            raise TypeError(
+                "array argument should be numpy.ndarray; however, it got [%s]." % type(array))
+
+        if array.ndim != 3:
+            raise ValueError("Invalid array shape")
+
+        height, width, nChannels = array.shape
+        ocvTypes = ImageSchema.ocvTypes
+        if nChannels == 1:
+            mode = ocvTypes["CV_8UC1"]
+        elif nChannels == 3:
+            mode = ocvTypes["CV_8UC3"]
+        elif nChannels == 4:
+            mode = ocvTypes["CV_8UC4"]
+        else:
+            raise ValueError("Invalid number of channels")
+
+        # Running `bytearray(numpy.array([1]))` fails in specific Python versions
+        # with a specific Numpy version, for example in Python 3.6.0 and NumPy 1.13.3.
+        # Here, it avoids it by converting it to bytes.
+        data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
+
+        # Creating new Row with _create_row(), because Row(name = value, ... )
+        # orders fields by name, which conflicts with expected schema order
+        # when the new DataFrame is created by UDF
+        return _create_row(self.imageFields,
+                           [origin, height, width, nChannels, mode, data])
+
+    def readImages(self, path, recursive=False, numPartitions=-1,
+                   dropImageFailures=False, sampleRatio=1.0, seed=0):
+        """
+        Reads the directory of images from the local or remote source.
+
+        .. note:: If multiple jobs are run in parallel with different sampleRatio or recursive flag,
+            there may be a race condition where one job overwrites the hadoop configs of another.
+
+        .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but
+            potentially non-deterministic.
+
+        :param str path: Path to the image directory.
+        :param bool recursive: Recursive search flag.
+        :param int numPartitions: Number of DataFrame partitions.
+        :param bool dropImageFailures: Drop the files that are not valid images.
+        :param float sampleRatio: Fraction of the images loaded.
+        :param int seed: Random number seed.
+        :return: a :class:`DataFrame` with a single column named "image",
+               see ImageSchema for details.
+
+        >>> df = ImageSchema.readImages('python/test_support/image/kittens', recursive=True)
+        >>> df.count()
+        4
+
+        .. versionadded:: 2.3.0
+        """
+
+        ctx = SparkContext._active_spark_context
+        spark = SparkSession(ctx)
+        image_schema = ctx._jvm.org.apache.spark.ml.image.ImageSchema
+        jsession = spark._jsparkSession
+        jresult = image_schema.readImages(path, jsession, recursive, numPartitions,
+                                          dropImageFailures, float(sampleRatio), seed)
+        return DataFrame(jresult, spark._wrapped)
+
+
+ImageSchema = _ImageSchema()
+
+
+# Monkey patch to disallow instantiation of this class.
+def _disallow_instance(_):
+    raise RuntimeError("Creating instance of _ImageSchema class is disallowed.")
+_ImageSchema.__init__ = _disallow_instance
diff --git a/python/sparkdl/__init__.py b/python/sparkdl/__init__.py
@@ -13,8 +13,18 @@
 # limitations under the License.
 #
 
+
+# hack to import copy-pasted image schema (to be removed in Spark2.3)
+# TODO remove in Spark2.3
+import os
+import pyspark.ml
+dir_path = os.path.dirname(os.path.realpath(__file__))
+parentdir = os.path.dirname(dir_path)
+pyspark.ml.__path__.append(os.path.join(parentdir, "pyspark", "ml"))
+
+from pyspark.ml.image import ImageSchema
+
 from .graph.input import TFInputGraph
-from .image.imageIO import imageSchema, imageType, readImages
 from .transformers.keras_image import KerasImageFileTransformer
 from .transformers.named_image import DeepImagePredictor, DeepImageFeaturizer
 from .transformers.tf_image import TFImageTransformer

diff --git a/python/sparkdl/graph/pieces.py b/python/sparkdl/graph/pieces.py
@@ -18,7 +18,6 @@
 import tensorflow as tf
 
 from sparkdl.graph.builder import IsolatedSession
-from sparkdl.image.imageIO import SparkMode
 
 logger = logging.getLogger('sparkdl')
 
@@ -48,14 +47,13 @@ def buildSpImageConverter(img_dtype):
         # This is the default behavior of Python Image Library
         shape = tf.reshape(tf.stack([height, width, num_channels], axis=0),
                            shape=(3,), name='shape')
-        if img_dtype == SparkMode.RGB:
+        if img_dtype == 'uint8':
             image_uint8 = tf.decode_raw(image_buffer, tf.uint8, name="decode_raw")
             image_float = tf.to_float(image_uint8)
-        else:
-            assert img_dtype == SparkMode.RGB_FLOAT32, \
-                "Unsupported dtype for image: {}".format(img_dtype)
+        elif img_dtype == 'float32':
             image_float = tf.decode_raw(image_buffer, tf.float32, name="decode_raw")
-
+        else:
+            raise ValueError('unsupported image data type "%s", currently only know how to handle uint8 and float32' % img_dtype)
         image_reshaped = tf.reshape(image_float, shape, name="reshaped")
         image_input = tf.expand_dims(image_reshaped, 0, name="image_input")
         gfn = issn.asGraphFunction([height, width, image_buffer, num_channels], [image_input])