Add lines and orient to read_json and to_json to improve error message (#2110)

The current behaviour of Koalas is equivalent to the following pandas calls:

```python
pd.read_json(lines=True)
df.to_json(orient='records', lines=True)
```

This PR adds these arguments explicitly and raises clear error messages when unsupported values are passed.
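A minimal sketch of the resulting behaviour, assuming a Spark-backed Koalas session (the DataFrame contents below are only illustrative):

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# The supported combination: records orientation, line-delimited output.
kdf.to_json(orient="records", lines=True)

# Any other value now fails fast with a clear NotImplementedError.
try:
    kdf.to_json(orient="table")
except NotImplementedError as error:
    print(error)

try:
    ks.read_json("invalid", lines=False)
except NotImplementedError as error:
    print(error)
```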
HyukjinKwon authored Mar 22, 2021
1 parent c6e596d commit deccf33
Showing 3 changed files with 36 additions and 2 deletions.
14 changes: 14 additions & 0 deletions databricks/koalas/generic.py
@@ -844,6 +844,8 @@ def to_json(
compression="uncompressed",
num_files=None,
mode: str = "overwrite",
orient="records",
lines=True,
partition_cols: Optional[Union[str, List[str]]] = None,
index_col: Optional[Union[str, List[str]]] = None,
**options
@@ -870,6 +872,12 @@ def to_json(
path : string, optional
File path. If not specified, the result is returned as
a string.
lines : bool, default True
If 'orient' is 'records', write out line-delimited JSON format;
other orients are not list-like and cannot be written this way.
It must always be True for now; otherwise a NotImplementedError is raised.
orient : str, default 'records'
It must always be 'records' for now; otherwise a NotImplementedError is raised.
compression : {'gzip', 'bz2', 'xz', None}
A string representing the compression to use in the output file,
only used when the first argument is a filename. By default, the
@@ -930,6 +938,12 @@ def to_json(
if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
options = options.get("options") # type: ignore

if not lines:
raise NotImplementedError("lines=False is not implemented yet.")

if orient != "records":
raise NotImplementedError("orient='records' is supported only for now.")

if path is None:
# If path is none, just collect and use pandas's to_json.
kdf_or_ser = self
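To illustrate the path-vs-no-path behaviour documented above, here is a hedged usage sketch; the output directory is purely illustrative and Spark must be available:

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# Without a path, the frame is collected to the driver and the JSON is
# returned as a string, using the new orient="records" default.
json_str = kdf.to_json()

# With a path, the write goes through Spark's JSON data source; the new
# defaults (orient="records", lines=True) are the only accepted values.
kdf.to_json(path="/tmp/koalas_to_json_demo", num_files=1)
```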
9 changes: 8 additions & 1 deletion databricks/koalas/namespace.py
@@ -412,14 +412,18 @@ def read_csv(
return kdf


def read_json(path: str, index_col: Optional[Union[str, List[str]]] = None, **options) -> DataFrame:
def read_json(
path: str, lines: bool = True, index_col: Optional[Union[str, List[str]]] = None, **options
) -> DataFrame:
"""
Convert a JSON string to DataFrame.
Parameters
----------
path : string
File path
lines : bool, default True
Read the file as a JSON object per line. It must always be True for now; otherwise a NotImplementedError is raised.
index_col : str or list of str, optional, default: None
Index column of table in Spark.
options : dict
@@ -460,6 +464,9 @@ def read_json(path: str, index_col: Optional[Union[str, List[str]]] = None, **op
if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
options = options.get("options") # type: ignore

if not lines:
raise NotImplementedError("lines=False is not implemented yet.")

return read_spark_io(path, format="json", index_col=index_col, **options)


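Correspondingly, a small round-trip sketch for the updated read_json signature (the directory name is again just for illustration):

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2], "b": ["x", "y"]})
kdf.to_json(path="/tmp/koalas_read_json_demo", num_files=1)

# lines=True (the default, and currently the only supported value) reads
# one JSON object per line, matching what Spark's JSON source writes.
restored = ks.read_json("/tmp/koalas_read_json_demo", lines=True)
```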
15 changes: 14 additions & 1 deletion databricks/koalas/tests/test_dataframe_conversion.py
@@ -139,7 +139,20 @@ def test_to_json(self):
pdf = self.pdf
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.to_json(), pdf.to_json(orient="records"))
self.assert_eq(kdf.to_json(orient="records"), pdf.to_json(orient="records"))

def test_to_json_negative(self):
kdf = ks.from_pandas(self.pdf)

with self.assertRaises(NotImplementedError):
kdf.to_json(orient="table")

with self.assertRaises(NotImplementedError):
kdf.to_json(lines=False)

def test_read_json_negative(self):
with self.assertRaises(NotImplementedError):
ks.read_json("invalid", lines=False)

def test_to_json_with_path(self):
pdf = pd.DataFrame({"a": [1], "b": ["a"]})
