
Commit e9becfc

Add read_json and let to_json use spark.write.json
1 parent 3457f61

4 files changed: +110 -141 lines changed

databricks/koalas/generic.py (+55 -129)

@@ -26,6 +26,7 @@

 from pyspark import sql as spark
 from pyspark.sql import functions as F
+from pyspark.sql.readwriter import OptionUtils
 from pyspark.sql.types import DataType, DoubleType, FloatType

 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
@@ -569,158 +570,83 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         return validate_arguments_and_invoke_function(
             kdf._to_internal_pandas(), self.to_csv, f, args)

-    def to_json(self, path_or_buf=None, orient=None, date_format=None,
-                double_precision=10, force_ascii=True, date_unit='ms',
-                default_handler=None, lines=False, compression='infer',
-                index=True):
+    def to_json(self, path=None, compression='uncompressed', num_files=None, **kwargs):
         """
         Convert the object to a JSON string.

+        .. note:: Koalas `to_json` writes files to a path or URI. Unlike pandas', Koalas
+            respects HDFS's property such as 'fs.default.name'.
+
+        .. note:: Koalas writes JSON files into the directory, `path`, and writes
+            multiple `part-...` files in the directory when `path` is specified.
+            This behaviour was inherited from Apache Spark. The number of files can
+            be controlled by `num_files`.
+
+        .. note:: output JSON format is different from pandas'. It always use `orient='records'`
+            for its output. This behaviour might have to change in the near future.
+
         Note NaN's and None will be converted to null and datetime objects
         will be converted to UNIX timestamps.

-        .. note:: This method should only be used if the resulting JSON is expected
-            to be small, as all the data is loaded into the driver's memory.
-
         Parameters
         ----------
-        path_or_buf : string or file handle, optional
-            File path or object. If not specified, the result is returned as
+        path : string, optional
+            File path. If not specified, the result is returned as
             a string.
-        orient : string
-            Indication of expected JSON string format.
-
-            * Series
-
-              - default is 'index'
-              - allowed values are: {'split','records','index','table'}
-
-            * DataFrame
-
-              - default is 'columns'
-              - allowed values are:
-                {'split','records','index','columns','values','table'}
-
-            * The format of the JSON string
-
-              - 'split' : dict like {'index' -> [index],
-                'columns' -> [columns], 'data' -> [values]}
-              - 'records' : list like
-                [{column -> value}, ... , {column -> value}]
-              - 'index' : dict like {index -> {column -> value}}
-              - 'columns' : dict like {column -> {index -> value}}
-              - 'values' : just the values array
-              - 'table' : dict like {'schema': {schema}, 'data': {data}}
-                describing the data, and the data component is
-                like ``orient='records'``.
-        date_format : {None, 'epoch', 'iso'}
-            Type of date conversion. 'epoch' = epoch milliseconds,
-            'iso' = ISO8601. The default depends on the `orient`. For
-            ``orient='table'``, the default is 'iso'. For all other orients,
-            the default is 'epoch'.
-        double_precision : int, default 10
-            The number of decimal places to use when encoding
-            floating point values.
-        force_ascii : bool, default True
-            Force encoded string to be ASCII.
-        date_unit : string, default 'ms' (milliseconds)
-            The time unit to encode to, governs timestamp and ISO8601
-            precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
-            microsecond, and nanosecond respectively.
-        default_handler : callable, default None
-            Handler to call if object cannot otherwise be converted to a
-            suitable format for JSON. Should receive a single argument which is
-            the object to convert and return a serialisable object.
-        lines : bool, default False
-            If 'orient' is 'records' write out line delimited json format. Will
-            throw ValueError if incorrect 'orient' since others are not list
-            like.
-        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
+        date_format : str, default None
+            Format string for datetime objects.
+        compression : {'gzip', 'bz2', 'xz', None}
             A string representing the compression to use in the output file,
             only used when the first argument is a filename. By default, the
             compression is inferred from the filename.
-        index : bool, default True
-            Whether to include the index values in the JSON string. Not
-            including the index (``index=False``) is only supported when
-            orient is 'split' or 'table'.

         Examples
         --------
-
         >>> df = ks.DataFrame([['a', 'b'], ['c', 'd']],
-        ...                   index=['row 1', 'row 2'],
         ...                   columns=['col 1', 'col 2'])
-        >>> df.to_json(orient='split')
-        '{"columns":["col 1","col 2"],\
-"index":["row 1","row 2"],\
-"data":[["a","b"],["c","d"]]}'
-
-        >>> df['col 1'].to_json(orient='split')
-        '{"name":"col 1","index":["row 1","row 2"],"data":["a","c"]}'
-
-        Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
-        Note that index labels are not preserved with this encoding.
-
-        >>> df.to_json(orient='records')
+        >>> df.to_json()
         '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

-        >>> df['col 1'].to_json(orient='records')
-        '["a","c"]'
-
-        Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
-
-        >>> df.to_json(orient='index')
-        '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
-
-        >>> df['col 1'].to_json(orient='index')
-        '{"row 1":"a","row 2":"c"}'
-
-        Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:
-
-        >>> df.to_json(orient='columns')
-        '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}'
-
-        >>> df['col 1'].to_json(orient='columns')
-        '{"row 1":"a","row 2":"c"}'
-
-        Encoding/decoding a Dataframe using ``'values'`` formatted JSON:
-
-        >>> df.to_json(orient='values')
-        '[["a","b"],["c","d"]]'
-
-        >>> df['col 1'].to_json(orient='values')
-        '["a","c"]'
-
-        Encoding with Table Schema
-
-        >>> df.to_json(orient='table')  # doctest: +SKIP
-        '{"schema": {"fields":[{"name":"index","type":"string"},\
-{"name":"col 1","type":"string"},\
-{"name":"col 2","type":"string"}],\
-"primaryKey":["index"],\
-"pandas_version":"0.20.0"}, \
-"data": [{"index":"row 1","col 1":"a","col 2":"b"},\
-{"index":"row 2","col 1":"c","col 2":"d"}]}'
-
-        >>> df['col 1'].to_json(orient='table')  # doctest: +SKIP
-        '{"schema": {"fields":[{"name":"index","type":"string"},\
-{"name":"col 1","type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"}, \
-"data": [{"index":"row 1","col 1":"a"},{"index":"row 2","col 1":"c"}]}'
+        >>> df['col 1'].to_json()
+        '[{"col 1":"a"},{"col 1":"c"}]'
+
+        >>> df.to_json(path=r'%s/to_json/foo.json' % path, num_files=1)
+        >>> ks.read_json(
+        ...     path=r'%s/to_json/foo.json' % path
+        ... ).sort_values(by="col 1")
+          col 1 col 2
+        0     a     b
+        1     c     d
+
+        >>> df['col 1'].to_json(path=r'%s/to_json/foo.json' % path, num_files=1)
+        >>> ks.read_json(
+        ...     path=r'%s/to_json/foo.json' % path
+        ... ).sort_values(by="col 1")
+          col 1
+        0     a
+        1     c
         """
-        # Make sure locals() call is at the top of the function so we don't capture local variables.
-        args = locals()
+        if path is None:
+            # If path is none, just collect and use pandas's to_json.
+            kdf_or_ser = self
+            pdf = kdf_or_ser.to_pandas()
+            if isinstance(self, ks.Series):
+                pdf = pdf.to_frame()
+            # To make the format consistent and readable by `read_json`, convert it to pandas' and
+            # use 'records' orient for now.
+            return pdf.to_json(orient='records')
+
         kdf = self
+        if isinstance(self, ks.Series):
+            kdf = self._kdf
+        sdf = kdf._sdf

-        if isinstance(self, ks.DataFrame):
-            f = pd.DataFrame.to_json
-        elif isinstance(self, ks.Series):
-            f = pd.Series.to_json
-        else:
-            raise TypeError('Constructor expects DataFrame or Series; however, '
-                            'got [%s]' % (self,))
+        if num_files is not None:
+            sdf = sdf.repartition(num_files)

-        return validate_arguments_and_invoke_function(
-            kdf._to_internal_pandas(), self.to_json, f, args)
+        builder = sdf.select(self._internal.data_columns).write.mode("overwrite")
+        OptionUtils._set_opts(builder, compression=compression)
+        builder.options(**kwargs).format("json").save(path)

     def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", float_format=None,
                  columns=None, header=True, index=True, index_label=None, startrow=0,
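The round trip this commit introduces is easiest to see end to end. Below is a
minimal sketch of the two code paths, assuming a local Spark session behind Koalas;
the temporary directory and variable names are illustrative, not part of the commit:

    import tempfile

    import databricks.koalas as ks

    kdf = ks.DataFrame({'col 1': ['a', 'c'], 'col 2': ['b', 'd']})

    # Without a path, to_json now collects to pandas and returns a string,
    # always in 'records' orient.
    print(kdf.to_json())
    # '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

    # With a path, Spark writes a directory of part-... files; num_files=1
    # repartitions first so exactly one part file is produced.
    out_dir = tempfile.mkdtemp()  # illustrative output directory
    kdf.to_json(out_dir, num_files=1)

    # Reading the directory back goes through Spark's JSON reader.
    print(ks.read_json(out_dir).sort_values(by='col 1'))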

databricks/koalas/namespace.py (+26 -1)

@@ -41,7 +41,7 @@
 __all__ = ["from_pandas", "range", "read_csv", "read_delta", "read_table", "read_spark_io",
            "read_parquet", "read_clipboard", "read_excel", "read_html", "to_datetime",
            "get_dummies", "concat", "melt", "isna", "isnull", "notna", "notnull",
-           "read_sql_table", "read_sql_query", "read_sql"]
+           "read_sql_table", "read_sql_query", "read_sql", "read_json"]


 def from_pandas(pobj: Union['pd.DataFrame', 'pd.Series']) -> Union['Series', 'DataFrame']:
@@ -241,6 +241,31 @@ def read_csv(path, header='infer', names=None, usecols=None,
     return DataFrame(sdf)


+def read_json(path: str, **options):
+    """
+    Convert a JSON string to pandas object.
+
+    Parameters
+    ----------
+    path : string
+        File path
+
+    Examples
+    --------
+    >>> df = ks.DataFrame([['a', 'b'], ['c', 'd']],
+    ...                   columns=['col 1', 'col 2'])
+
+    >>> df.to_json(path=r'%s/read_json/foo.json' % path, num_files=1)
+    >>> ks.read_json(
+    ...     path=r'%s/read_json/foo.json' % path
+    ... ).sort_values(by="col 1")
+      col 1 col 2
+    0     a     b
+    1     c     d
+    """
+    return read_spark_io(path, format='json', options=options)
+
+
 def read_delta(path: str, version: Optional[str] = None, timestamp: Optional[str] = None,
                **options) -> DataFrame:
     """

databricks/koalas/tests/test_dataframe_conversion.py (+22 -11)

@@ -13,8 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import os
+import shutil
 import string
+import tempfile

 import numpy as np
 import pandas as pd
@@ -27,6 +29,12 @@
 class DataFrameConversionTest(ReusedSQLTestCase, SQLTestUtils, TestUtils):
     """Test cases for "small data" conversion and I/O."""

+    def setUp(self):
+        self.tmp_dir = tempfile.mkdtemp(prefix=DataFrameConversionTest.__name__)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
     @property
     def pdf(self):
         return pd.DataFrame({
@@ -162,16 +170,19 @@ def test_to_json(self):
         pdf = self.pdf
         kdf = ks.from_pandas(pdf)

-        self.assert_eq(kdf.to_json(), pdf.to_json())
-        self.assert_eq(kdf.to_json(orient='split'), pdf.to_json(orient='split'))
-        self.assert_eq(kdf.to_json(orient='records'), pdf.to_json(orient='records'))
-        self.assert_eq(kdf.to_json(orient='index'), pdf.to_json(orient='index'))
-        self.assert_eq(kdf.to_json(orient='values'), pdf.to_json(orient='values'))
-        self.assert_eq(kdf.to_json(orient='table'), pdf.to_json(orient='table'))
-        self.assert_eq(kdf.to_json(orient='records', lines=True),
-                       pdf.to_json(orient='records', lines=True))
-        self.assert_eq(kdf.to_json(orient='split', index=False),
-                       pdf.to_json(orient='split', index=False))
+        self.assert_eq(kdf.to_json(), pdf.to_json(orient='records'))
+
+    def test_to_json_with_path(self):
+        pdf = pd.DataFrame({'a': [1], 'b': ['a']})
+        kdf = ks.DataFrame(pdf)
+
+        kdf.to_json(self.tmp_dir, num_files=1)
+        expected = pdf.to_json(orient='records')
+
+        output_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("part-")]
+        assert len(output_paths) > 0
+        output_path = "%s/%s" % (self.tmp_dir, output_paths[0])
+        self.assertEqual("[%s]" % open(output_path).read().strip(), expected)

     def test_to_clipboard(self):
         pdf = self.pdf
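The final assertion in test_to_json_with_path leans on Spark's on-disk layout: the
JSON sink writes newline-delimited JSON, one object per line, so bracketing the
stripped file contents reproduces pandas' orient='records' string. A sketch of that
check in isolation (the part-file contents shown are illustrative):

    # What Spark writes for the one-row frame pd.DataFrame({'a': [1], 'b': ['a']}):
    part_contents = '{"a":1,"b":"a"}\n'

    # Bracketing the stripped contents yields the pandas orient='records' string.
    assert '[%s]' % part_contents.strip() == '[{"a":1,"b":"a"}]'

    # Note the test uses a single row: with several rows in one part file the
    # lines would also need commas between them, so simple bracketing would not
    # produce valid JSON. Presumably that is why the fixture has one row and
    # the test pins num_files=1.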

docs/source/reference/io.rst (+7)

@@ -69,6 +69,13 @@ Excel
    read_excel
    DataFrame.to_excel

+JSON
+----
+.. autosummary::
+   :toctree: api/
+
+   read_json
+
 HTML
 ----
 .. autosummary::
