Python.

rxin · rxin · commit aeaaeb4a6f4d · 2016-01-03T21:30:22.000-08:00
diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
@@ -65,7 +65,7 @@ def deco(f):
 
 
 # for back compatibility
-from pyspark.sql import SQLContext, HiveContext, SchemaRDD, Row
+from pyspark.sql import SQLContext, HiveContext, Row
 
 __all__ = [
     "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast",
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
@@ -47,7 +47,7 @@
 from pyspark.sql.types import Row
 from pyspark.sql.context import SQLContext, HiveContext
 from pyspark.sql.column import Column
-from pyspark.sql.dataframe import DataFrame, SchemaRDD, DataFrameNaFunctions, DataFrameStatFunctions
+from pyspark.sql.dataframe import DataFrame, DataFrameNaFunctions, DataFrameStatFunctions
 from pyspark.sql.group import GroupedData
 from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter
 from pyspark.sql.window import Window, WindowSpec
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
@@ -27,8 +27,7 @@
 from pyspark.rdd import ignore_unicode_prefix
 from pyspark.sql.types import *
 
-__all__ = ["DataFrame", "Column", "SchemaRDD", "DataFrameNaFunctions",
-           "DataFrameStatFunctions"]
+__all__ = ["DataFrame", "Column", "DataFrameNaFunctions", "DataFrameStatFunctions"]
 
 
 def _create_column_from_literal(literal):
@@ -272,23 +271,6 @@ def substr(self, startPos, length):
 
     __getslice__ = substr
 
-    @ignore_unicode_prefix
-    @since(1.3)
-    def inSet(self, *cols):
-        """
-        A boolean expression that is evaluated to true if the value of this
-        expression is contained by the evaluated values of the arguments.
-
-        >>> df[df.name.inSet("Bob", "Mike")].collect()
-        [Row(age=5, name=u'Bob')]
-        >>> df[df.age.inSet([1, 2, 3])].collect()
-        [Row(age=2, name=u'Alice')]
-
-        .. note:: Deprecated in 1.5, use :func:`Column.isin` instead.
-        """
-        warnings.warn("inSet is deprecated. Use isin() instead.")
-        return self.isin(*cols)
-
     @ignore_unicode_prefix
     @since(1.5)
     def isin(self, *cols):
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
@@ -274,33 +274,6 @@ def _inferSchema(self, rdd, samplingRatio=None):
             schema = rdd.map(_infer_schema).reduce(_merge_type)
         return schema
 
-    @ignore_unicode_prefix
-    def inferSchema(self, rdd, samplingRatio=None):
-        """
-        .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.
-        """
-        warnings.warn("inferSchema is deprecated, please use createDataFrame instead.")
-
-        if isinstance(rdd, DataFrame):
-            raise TypeError("Cannot apply schema to DataFrame")
-
-        return self.createDataFrame(rdd, None, samplingRatio)
-
-    @ignore_unicode_prefix
-    def applySchema(self, rdd, schema):
-        """
-        .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.
-        """
-        warnings.warn("applySchema is deprecated, please use createDataFrame instead")
-
-        if isinstance(rdd, DataFrame):
-            raise TypeError("Cannot apply schema to DataFrame")
-
-        if not isinstance(schema, StructType):
-            raise TypeError("schema should be StructType, but got %s" % type(schema))
-
-        return self.createDataFrame(rdd, schema)
-
     def _createFromRDD(self, rdd, schema, samplingRatio):
         """
         Create an RDD for DataFrame from an existing RDD, returns the RDD and schema.
@@ -450,90 +423,6 @@ def dropTempTable(self, tableName):
         """
         self._ssql_ctx.dropTempTable(tableName)
 
-    def parquetFile(self, *paths):
-        """Loads a Parquet file, returning the result as a :class:`DataFrame`.
-
-        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.parquet` instead.
-
-        >>> sqlContext.parquetFile('python/test_support/sql/parquet_partitioned').dtypes
-        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
-        """
-        warnings.warn("parquetFile is deprecated. Use read.parquet() instead.")
-        gateway = self._sc._gateway
-        jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths))
-        for i in range(0, len(paths)):
-            jpaths[i] = paths[i]
-        jdf = self._ssql_ctx.parquetFile(jpaths)
-        return DataFrame(jdf, self)
-
-    def jsonFile(self, path, schema=None, samplingRatio=1.0):
-        """Loads a text file storing one JSON object per line as a :class:`DataFrame`.
-
-        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.json` instead.
-
-        >>> sqlContext.jsonFile('python/test_support/sql/people.json').dtypes
-        [('age', 'bigint'), ('name', 'string')]
-        """
-        warnings.warn("jsonFile is deprecated. Use read.json() instead.")
-        if schema is None:
-            df = self._ssql_ctx.jsonFile(path, samplingRatio)
-        else:
-            scala_datatype = self._ssql_ctx.parseDataType(schema.json())
-            df = self._ssql_ctx.jsonFile(path, scala_datatype)
-        return DataFrame(df, self)
-
-    @ignore_unicode_prefix
-    @since(1.0)
-    def jsonRDD(self, rdd, schema=None, samplingRatio=1.0):
-        """Loads an RDD storing one JSON object per string as a :class:`DataFrame`.
-
-        If the schema is provided, applies the given schema to this JSON dataset.
-        Otherwise, it samples the dataset with ratio ``samplingRatio`` to determine the schema.
-
-        >>> df1 = sqlContext.jsonRDD(json)
-        >>> df1.first()
-        Row(field1=1, field2=u'row1', field3=Row(field4=11, field5=None), field6=None)
-
-        >>> df2 = sqlContext.jsonRDD(json, df1.schema)
-        >>> df2.first()
-        Row(field1=1, field2=u'row1', field3=Row(field4=11, field5=None), field6=None)
-
-        >>> from pyspark.sql.types import *
-        >>> schema = StructType([
-        ...     StructField("field2", StringType()),
-        ...     StructField("field3",
-        ...                 StructType([StructField("field5", ArrayType(IntegerType()))]))
-        ... ])
-        >>> df3 = sqlContext.jsonRDD(json, schema)
-        >>> df3.first()
-        Row(field2=u'row1', field3=Row(field5=None))
-        """
-
-        def func(iterator):
-            for x in iterator:
-                if not isinstance(x, basestring):
-                    x = unicode(x)
-                if isinstance(x, unicode):
-                    x = x.encode("utf-8")
-                yield x
-        keyed = rdd.mapPartitions(func)
-        keyed._bypass_serializer = True
-        jrdd = keyed._jrdd.map(self._jvm.BytesToString())
-        if schema is None:
-            df = self._ssql_ctx.jsonRDD(jrdd.rdd(), samplingRatio)
-        else:
-            scala_datatype = self._ssql_ctx.parseDataType(schema.json())
-            df = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype)
-        return DataFrame(df, self)
-
-    def load(self, path=None, source=None, schema=None, **options):
-        """Returns the dataset in a data source as a :class:`DataFrame`.
-
-        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.load` instead.
-        """
-        warnings.warn("load is deprecated. Use read.load() instead.")
-        return self.read.load(path, source, schema, **options)
-
     @since(1.3)
     def createExternalTable(self, tableName, path=None, source=None, schema=None, **options):
         """Creates an external table based on the dataset in a data source.
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
@@ -36,7 +36,7 @@
 from pyspark.sql.readwriter import DataFrameWriter
 from pyspark.sql.types import *
 
-__all__ = ["DataFrame", "SchemaRDD", "DataFrameNaFunctions", "DataFrameStatFunctions"]
+__all__ = ["DataFrame", "DataFrameNaFunctions", "DataFrameStatFunctions"]
 
 
 class DataFrame(object):
@@ -113,14 +113,6 @@ def toJSON(self, use_unicode=True):
         rdd = self._jdf.toJSON()
         return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode))
 
-    def saveAsParquetFile(self, path):
-        """Saves the contents as a Parquet file, preserving the schema.
-
-        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.parquet` instead.
-        """
-        warnings.warn("saveAsParquetFile is deprecated. Use write.parquet() instead.")
-        self._jdf.saveAsParquetFile(path)
-
     @since(1.3)
     def registerTempTable(self, name):
         """Registers this RDD as a temporary table using the given name.
@@ -135,38 +127,6 @@ def registerTempTable(self, name):
         """
         self._jdf.registerTempTable(name)
 
-    def registerAsTable(self, name):
-        """
-        .. note:: Deprecated in 1.4, use :func:`registerTempTable` instead.
-        """
-        warnings.warn("Use registerTempTable instead of registerAsTable.")
-        self.registerTempTable(name)
-
-    def insertInto(self, tableName, overwrite=False):
-        """Inserts the contents of this :class:`DataFrame` into the specified table.
-
-        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.insertInto` instead.
-        """
-        warnings.warn("insertInto is deprecated. Use write.insertInto() instead.")
-        self.write.insertInto(tableName, overwrite)
-
-    def saveAsTable(self, tableName, source=None, mode="error", **options):
-        """Saves the contents of this :class:`DataFrame` to a data source as a table.
-
-        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.saveAsTable` instead.
-        """
-        warnings.warn("insertInto is deprecated. Use write.saveAsTable() instead.")
-        self.write.saveAsTable(tableName, source, mode, **options)
-
-    @since(1.3)
-    def save(self, path=None, source=None, mode="error", **options):
-        """Saves the contents of the :class:`DataFrame` to a data source.
-
-        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.save` instead.
-        """
-        warnings.warn("insertInto is deprecated. Use write.save() instead.")
-        return self.write.save(path, source, mode, **options)
-
     @property
     @since(1.4)
     def write(self):
@@ -1388,12 +1348,6 @@ def toPandas(self):
     drop_duplicates = dropDuplicates
 
 
-# Having SchemaRDD for backward compatibility (for docs)
-class SchemaRDD(DataFrame):
-    """SchemaRDD is deprecated, please use :class:`DataFrame`.
-    """
-
-
 def _to_scala_map(sc, jm):
     """
     Convert a dict into a JVM Map.
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
@@ -149,12 +149,8 @@ def _():
 }
 
 _window_functions = {
-    'rowNumber':
-        """.. note:: Deprecated in 1.6, use row_number instead.""",
     'row_number':
         """returns a sequential number starting at 1 within a window partition.""",
-    'denseRank':
-        """.. note:: Deprecated in 1.6, use dense_rank instead.""",
     'dense_rank':
         """returns the rank of rows within a window partition, without any gaps.
 
@@ -171,13 +167,9 @@ def _():
         place and that the next person came in third.
 
         This is equivalent to the RANK function in SQL.""",
-    'cumeDist':
-        """.. note:: Deprecated in 1.6, use cume_dist instead.""",
     'cume_dist':
         """returns the cumulative distribution of values within a window partition,
         i.e. the fraction of rows that are below the current row.""",
-    'percentRank':
-        """.. note:: Deprecated in 1.6, use percent_rank instead.""",
     'percent_rank':
         """returns the relative rank (i.e. percentile) of rows within a window partition.""",
 }
@@ -318,14 +310,6 @@ def isnull(col):
     return Column(sc._jvm.functions.isnull(_to_java_column(col)))
 
 
-@since(1.4)
-def monotonicallyIncreasingId():
-    """
-    .. note:: Deprecated in 1.6, use monotonically_increasing_id instead.
-    """
-    return monotonically_increasing_id()
-
-
 @since(1.6)
 def monotonically_increasing_id():
     """A column that generates monotonically increasing 64-bit integers.
@@ -434,14 +418,6 @@ def shiftRightUnsigned(col, numBits):
     return Column(jc)
 
 
-@since(1.4)
-def sparkPartitionId():
-    """
-    .. note:: Deprecated in 1.6, use spark_partition_id instead.
-    """
-    return spark_partition_id()
-
-
 @since(1.6)
 def spark_partition_id():
     """A column for partition ID of the Spark task.
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
@@ -130,11 +130,9 @@ def load(self, path=None, format=None, schema=None, **options):
             self.schema(schema)
         self.options(**options)
         if path is not None:
-            if type(path) == list:
-                return self._df(
-                    self._jreader.load(self._sqlContext._sc._jvm.PythonUtils.toSeq(path)))
-            else:
-                return self._df(self._jreader.load(path))
+            if type(path) != list:
+                path = [path]
+            return self._df(self._jreader.load(self._sqlContext._sc._jvm.PythonUtils.toSeq(path)))
         else:
             return self._df(self._jreader.load())
 
@@ -179,7 +177,17 @@ def json(self, path, schema=None):
         elif type(path) == list:
             return self._df(self._jreader.json(self._sqlContext._sc._jvm.PythonUtils.toSeq(path)))
         elif isinstance(path, RDD):
-            return self._df(self._jreader.json(path._jrdd))
+            def func(iterator):
+                for x in iterator:
+                    if not isinstance(x, basestring):
+                        x = unicode(x)
+                    if isinstance(x, unicode):
+                        x = x.encode("utf-8")
+                    yield x
+            keyed = path.mapPartitions(func)
+            keyed._bypass_serializer = True
+            jrdd = keyed._jrdd.map(self._sqlContext._jvm.BytesToString())
+            return self._df(self._jreader.json(jrdd))
         else:
             raise TypeError("path can be only string or RDD")
 
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py