
Commit bc3b72b

More improvements to DataFrame Python doc.
1 parent ac1d4c0 commit bc3b72b

2 files changed: +152 -149 lines

python/pyspark/sql/context.py

Lines changed: 27 additions & 25 deletions
@@ -40,7 +40,7 @@
 def _monkey_patch_RDD(sqlCtx):
     def toDF(self, schema=None, sampleRatio=None):
         """
-        Convert current :class:`RDD` into a :class:`DataFrame`
+        Converts current :class:`RDD` into a :class:`DataFrame`
 
         This is a shorthand for ``sqlCtx.createDataFrame(rdd, schema, sampleRatio)``
 
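
For orientation, a minimal sketch of the monkey-patched ``toDF`` in use (not part of the commit), assuming a live SparkContext ``sc`` and an already-constructed SQLContext:

    # Constructing a SQLContext monkey-patches toDF onto RDD.
    rdd = sc.parallelize([('Alice', 1), ('Bob', 2)])
    df = rdd.toDF(['name', 'age'])  # shorthand for sqlCtx.createDataFrame(rdd, ['name', 'age'])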
@@ -59,13 +59,14 @@ def toDF(self, schema=None, sampleRatio=None):
 class SQLContext(object):
     """Main entry point for Spark SQL functionality.
 
-    A SQLContext can be used create L{DataFrame}, register L{DataFrame} as
+    A SQLContext can be used create :class:`DataFrame`, register :class:`DataFrame` as
     tables, execute SQL over tables, cache tables, and read parquet files.
 
-    When created, L{SQLContext} adds a method called ``toDF`` to :class:`RDD`, which could be
-    used to convert an RDD into a DataFrame, it's a shorthand for L{SQLContext.createDataFrame}.
+    When created, :class:`SQLContext` adds a method called ``toDF`` to :class:`RDD`,
+    which could be used to convert an RDD into a DataFrame, it's a shorthand for
+    :func:`SQLContext.createDataFrame`.
 
-    :param sparkContext: The SparkContext to wrap.
+    :param sparkContext: The :class:`SparkContext` backing this SQLContext.
     :param sqlContext: An optional JVM Scala SQLContext. If set, we do not instantiate a new
         SQLContext in the JVM, instead we make all calls to this object.
     """
@@ -209,21 +210,23 @@ def applySchema(self, rdd, schema):
 
     def createDataFrame(self, data, schema=None, samplingRatio=None):
         """
-        Create a DataFrame from an RDD of L{tuple}/L{list}, list or L{pandas.DataFrame}.
+        Creates a :class:`DataFrame` from an :class:`RDD` of :class:`tuple`/:class:`list`,
+        list or :class:`pandas.DataFrame`.
 
         When ``schema`` is a list of column names, the type of each column
-        will be inferred from ``rdd``.
+        will be inferred from ``data``.
 
         When ``schema`` is ``None``, it will try to infer the schema (column names and types)
-        from ``rdd``, which should be an RDD of :class:`Row`, or L{namedtuple}, or L{dict}.
+        from ``data``, which should be an RDD of :class:`Row`,
+        or :class:`namedtuple`, or :class:`dict`.
 
         If schema inference is needed, ``samplingRatio`` is used to determined the ratio of
         rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.
 
-        :param data: an RDD of Row/tuple/list/dict, list, or pandas.DataFrame
-        :param schema: a StructType or list of column names. default None.
+        :param data: an RDD of :class:`Row`/:class:`tuple`/:class:`list`/:class:`dict`,
+            :class:`list`, or :class:`pandas.DataFrame`.
+        :param schema: a :class:`StructType` or list of column names. default None.
         :param samplingRatio: the sample ratio of rows used for inferring
-        :return: a L{DataFrame}
 
         >>> l = [('Alice', 1)]
         >>> sqlCtx.createDataFrame(l).collect()
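
A short sketch of the input forms the docstring describes (assuming ``sc`` and ``sqlCtx`` from above):

    from pyspark.sql import Row

    # Schema given as a list of column names; types inferred from the data.
    df1 = sqlCtx.createDataFrame([('Alice', 1)], ['name', 'age'])

    # Schema omitted; inferred from the first Row since samplingRatio is None.
    rdd = sc.parallelize([Row(name='Alice', age=1)])
    df2 = sqlCtx.createDataFrame(rdd)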
@@ -309,9 +312,9 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
         return DataFrame(df, self)
 
     def registerDataFrameAsTable(self, df, tableName):
-        """Registers the given L{DataFrame} as a temporary table in the catalog.
+        """Registers the given :class:`DataFrame` as a temporary table in the catalog.
 
-        Temporary tables exist only during the lifetime of this instance of L{SQLContext}.
+        Temporary tables exist only during the lifetime of this instance of :class:`SQLContext`.
 
         >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         """
@@ -321,7 +324,7 @@ def registerDataFrameAsTable(self, df, tableName):
             raise ValueError("Can only register DataFrame as table")
 
     def parquetFile(self, *paths):
-        """Loads a Parquet file, returning the result as a L{DataFrame}.
+        """Loads a Parquet file, returning the result as a :class:`DataFrame`.
 
         >>> import tempfile, shutil
         >>> parquetFile = tempfile.mkdtemp()
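
A round-trip sketch with a hypothetical path, in the spirit of the doctest above:

    # saveAsParquetFile writes the DataFrame out; parquetFile reads it back.
    df.saveAsParquetFile("/tmp/users.parquet")
    df2 = sqlCtx.parquetFile("/tmp/users.parquet")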
@@ -339,8 +342,7 @@ def parquetFile(self, *paths):
         return DataFrame(jdf, self)
 
     def jsonFile(self, path, schema=None, samplingRatio=1.0):
-        """
-        Loads a text file storing one JSON object per line as a L{DataFrame}.
+        """Loads a text file storing one JSON object per line as a :class:`DataFrame`.
 
         If the schema is provided, applies the given schema to this JSON dataset.
         Otherwise, it samples the dataset with ratio ``samplingRatio`` to determine the schema.
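
A minimal sketch with a hypothetical path (one JSON object per line):

    # With the default samplingRatio=1.0 the whole dataset is scanned to
    # infer the schema; pass an explicit schema to skip inference.
    df = sqlCtx.jsonFile("/tmp/people.json")
    df2 = sqlCtx.jsonFile("/tmp/people.json", df.schema)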
@@ -379,7 +381,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0):
         return DataFrame(df, self)
 
     def jsonRDD(self, rdd, schema=None, samplingRatio=1.0):
-        """Loads an RDD storing one JSON object per string as a L{DataFrame}.
+        """Loads an RDD storing one JSON object per string as a :class:`DataFrame`.
 
         If the schema is provided, applies the given schema to this JSON dataset.
         Otherwise, it samples the dataset with ratio ``samplingRatio`` to determine the schema.
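
A minimal sketch, assuming ``sc`` as before; each RDD element is one complete JSON document:

    strings = sc.parallelize(['{"name": "Alice", "age": 1}',
                              '{"name": "Bob", "age": 2}'])
    df = sqlCtx.jsonRDD(strings)  # schema inferred by sampling (here, all rows)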
@@ -421,7 +423,7 @@ def func(iterator):
         return DataFrame(df, self)
 
     def load(self, path=None, source=None, schema=None, **options):
-        """Returns the dataset in a data source as a L{DataFrame}.
+        """Returns the dataset in a data source as a :class:`DataFrame`.
 
         The data source is specified by the ``source`` and a set of ``options``.
         If ``source`` is not specified, the default data source configured by
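
A sketch with hypothetical paths; when ``source`` is omitted, the default from ``spark.sql.sources.default`` (parquet unless reconfigured) is used:

    df = sqlCtx.load("/tmp/users.parquet")                # default data source
    df2 = sqlCtx.load("/tmp/people.json", source="json")  # explicit source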
@@ -455,7 +457,7 @@ def createExternalTable(self, tableName, path=None, source=None,
         If ``source`` is not specified, the default data source configured by
         ``spark.sql.sources.default`` will be used.
 
-        Optionally, a schema can be provided as the schema of the returned L{DataFrame} and
+        Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
         created external table.
         """
         if path is not None:
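
A hypothetical sketch; persistent (non-temporary) tables generally need a Hive-backed catalog, so a HiveContext is assumed here:

    # Assumes `hiveCtx` is a HiveContext; the table's data stays at `path`.
    df = hiveCtx.createExternalTable("users", path="/tmp/users.parquet")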
@@ -476,7 +478,7 @@ def createExternalTable(self, tableName, path=None, source=None,
         return DataFrame(df, self)
 
     def sql(self, sqlQuery):
-        """Returns a L{DataFrame} representing the result of the given query.
+        """Returns a :class:`DataFrame` representing the result of the given query.
 
         >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1")
@@ -486,7 +488,7 @@ def sql(self, sqlQuery):
         return DataFrame(self._ssql_ctx.sql(sqlQuery), self)
 
     def table(self, tableName):
-        """Returns the specified table as a L{DataFrame}.
+        """Returns the specified table as a :class:`DataFrame`.
 
         >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlCtx.table("table1")
@@ -496,12 +498,12 @@ def table(self, tableName):
         return DataFrame(self._ssql_ctx.table(tableName), self)
 
     def tables(self, dbName=None):
-        """Returns a L{DataFrame} containing names of tables in the given database.
+        """Returns a :class:`DataFrame` containing names of tables in the given database.
 
         If ``dbName`` is not specified, the current database will be used.
 
         The returned DataFrame has two columns: ``tableName`` and ``isTemporary``
-        (a column with L{BooleanType} indicating if a table is a temporary one or not).
+        (a column with :class:`BooleanType` indicating if a table is a temporary one or not).
 
         >>> sqlCtx.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlCtx.tables()
@@ -545,12 +547,12 @@ def clearCache(self):
 class HiveContext(SQLContext):
     """A variant of Spark SQL that integrates with data stored in Hive.
 
-    Configuration for Hive is read from hive-site.xml on the classpath.
+    Configuration for Hive is read from ``hive-site.xml`` on the classpath.
     It supports running both SQL and HiveQL commands.
 
     :param sparkContext: The SparkContext to wrap.
     :param hiveContext: An optional JVM Scala HiveContext. If set, we do not instantiate a new
-        L{HiveContext} in the JVM, instead we make all calls to this object.
+        :class:`HiveContext` in the JVM, instead we make all calls to this object.
     """
 
     def __init__(self, sparkContext, hiveContext=None):
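
And a minimal construction sketch for the class documented above, assuming a live SparkContext ``sc`` and Hive configured via ``hive-site.xml``:

    from pyspark.sql import HiveContext

    hiveCtx = HiveContext(sc)
    rows = hiveCtx.sql("SELECT * FROM src")  # HiveQL against a hypothetical Hive table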
