Add lines and orient to read_json and to_json to improve error message (#2110)

The current behaviour of Koalas is equivalent to the following pandas calls:

```python
pd.read_json(lines=True)
df.to_json(orient='records', lines=True)
```

This PR adds these arguments explicitly and raises clear error messages when unsupported values are passed.
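A minimal sketch of the resulting behaviour, assuming a Spark-backed Koalas session (the DataFrame contents below are only illustrative):

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# The supported combination: records orientation, line-delimited output.
kdf.to_json(orient="records", lines=True)

# Any other value now fails fast with a clear NotImplementedError.
try:
    kdf.to_json(orient="table")
except NotImplementedError as error:
    print(error)

try:
    ks.read_json("invalid", lines=False)
except NotImplementedError as error:
    print(error)
```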
HyukjinKwon authored Mar 22, 2021
1 parent c6e596d commit deccf33
Showing 3 changed files with 36 additions and 2 deletions.
14 changes: 14 additions & 0 deletions databricks/koalas/generic.py
@@ -844,6 +844,8 @@ def to_json(
compression="uncompressed",
num_files=None,
mode: str = "overwrite",
orient="records",
lines=True,
partition_cols: Optional[Union[str, List[str]]] = None,
index_col: Optional[Union[str, List[str]]] = None,
**options
@@ -870,6 +872,12 @@ def to_json(
path : string, optional
File path. If not specified, the result is returned as
a string.
lines : bool, default True
If 'orient' is 'records', write out line-delimited JSON format;
other orients are not list-like and cannot be written this way.
It must always be True for now; otherwise a NotImplementedError is raised.
orient : str, default 'records'
It must always be 'records' for now; otherwise a NotImplementedError is raised.
compression : {'gzip', 'bz2', 'xz', None}
A string representing the compression to use in the output file,
only used when the first argument is a filename. By default, the
@@ -930,6 +938,12 @@ def to_json(
if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
options = options.get("options") # type: ignore

if not lines:
raise NotImplementedError("lines=False is not implemented yet.")

if orient != "records":
raise NotImplementedError("orient='records' is supported only for now.")

if path is None:
# If path is none, just collect and use pandas's to_json.
kdf_or_ser = self
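To illustrate the path-vs-no-path behaviour documented above, here is a hedged usage sketch; the output directory is purely illustrative and Spark must be available:

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# Without a path, the frame is collected to the driver and the JSON is
# returned as a string, using the new orient="records" default.
json_str = kdf.to_json()

# With a path, the write goes through Spark's JSON data source; the new
# defaults (orient="records", lines=True) are the only accepted values.
kdf.to_json(path="/tmp/koalas_to_json_demo", num_files=1)
```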
9 changes: 8 additions & 1 deletion databricks/koalas/namespace.py
@@ -412,14 +412,18 @@ def read_csv(
return kdf


def read_json(path: str, index_col: Optional[Union[str, List[str]]] = None, **options) -> DataFrame:
def read_json(
path: str, lines: bool = True, index_col: Optional[Union[str, List[str]]] = None, **options
) -> DataFrame:
"""
Convert a JSON string to DataFrame.
Parameters
----------
path : string
File path
lines : bool, default True
Read the file as a JSON object per line. It must always be True for now; otherwise a NotImplementedError is raised.
index_col : str or list of str, optional, default: None
Index column of table in Spark.
options : dict
@@ -460,6 +464,9 @@ def read_json(path: str, index_col: Optional[Union[str, List[str]]] = None, **op
if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
options = options.get("options") # type: ignore

if not lines:
raise NotImplementedError("lines=False is not implemented yet.")

return read_spark_io(path, format="json", index_col=index_col, **options)


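Correspondingly, a small round-trip sketch for the updated read_json signature (the directory name is again just for illustration):

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2], "b": ["x", "y"]})
kdf.to_json(path="/tmp/koalas_read_json_demo", num_files=1)

# lines=True (the default, and currently the only supported value) reads
# one JSON object per line, matching what Spark's JSON source writes.
restored = ks.read_json("/tmp/koalas_read_json_demo", lines=True)
```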
15 changes: 14 additions & 1 deletion databricks/koalas/tests/test_dataframe_conversion.py
@@ -139,7 +139,20 @@ def test_to_json(self):
pdf = self.pdf
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.to_json(), pdf.to_json(orient="records"))
self.assert_eq(kdf.to_json(orient="records"), pdf.to_json(orient="records"))

def test_to_json_negative(self):
kdf = ks.from_pandas(self.pdf)

with self.assertRaises(NotImplementedError):
kdf.to_json(orient="table")

with self.assertRaises(NotImplementedError):
kdf.to_json(lines=False)

def test_read_json_negative(self):
with self.assertRaises(NotImplementedError):
ks.read_json("invalid", lines=False)

def test_to_json_with_path(self):
pdf = pd.DataFrame({"a": [1], "b": ["a"]})
