@@ -26,6 +26,7 @@

 from pyspark import sql as spark
 from pyspark.sql import functions as F
+from pyspark.sql.readwriter import OptionUtils
 from pyspark.sql.types import DataType, DoubleType, FloatType

 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
@@ -569,158 +570,83 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         return validate_arguments_and_invoke_function(
             kdf._to_internal_pandas(), self.to_csv, f, args)

-    def to_json(self, path_or_buf=None, orient=None, date_format=None,
-                double_precision=10, force_ascii=True, date_unit='ms',
-                default_handler=None, lines=False, compression='infer',
-                index=True):
+    def to_json(self, path=None, compression='uncompressed', num_files=None, **kwargs):
         """
         Convert the object to a JSON string.

+        .. note:: Koalas `to_json` writes files to a path or URI. Unlike pandas,
+            Koalas respects HDFS properties such as 'fs.default.name'.
+
+        .. note:: Koalas writes JSON files into a directory, `path`, and writes
+            multiple `part-...` files in that directory when `path` is specified.
+            This behaviour is inherited from Apache Spark. The number of files
+            can be controlled by `num_files`.
+
+        .. note:: The output JSON format differs from pandas': it always uses
+            `orient='records'`. This behaviour might change in the near future.
+
         Note NaN's and None will be converted to null and datetime objects
         will be converted to UNIX timestamps.

-        .. note:: This method should only be used if the resulting JSON is expected
-            to be small, as all the data is loaded into the driver's memory.
-
         Parameters
         ----------
-        path_or_buf : string or file handle, optional
-            File path or object. If not specified, the result is returned as
+        path : string, optional
+            File path. If not specified, the result is returned as
             a string.
-        orient : string
-            Indication of expected JSON string format.
-
-            * Series
-
-                - default is 'index'
-                - allowed values are: {'split','records','index','table'}
-
-            * DataFrame
-
-                - default is 'columns'
-                - allowed values are:
-                  {'split','records','index','columns','values','table'}
-
-            * The format of the JSON string
-
-                - 'split' : dict like {'index' -> [index],
-                  'columns' -> [columns], 'data' -> [values]}
-                - 'records' : list like
-                  [{column -> value}, ... , {column -> value}]
-                - 'index' : dict like {index -> {column -> value}}
-                - 'columns' : dict like {column -> {index -> value}}
-                - 'values' : just the values array
-                - 'table' : dict like {'schema': {schema}, 'data': {data}}
-                  describing the data, and the data component is
-                  like ``orient='records'``.
-        date_format : {None, 'epoch', 'iso'}
-            Type of date conversion. 'epoch' = epoch milliseconds,
-            'iso' = ISO8601. The default depends on the `orient`. For
-            ``orient='table'``, the default is 'iso'. For all other orients,
-            the default is 'epoch'.
-        double_precision : int, default 10
-            The number of decimal places to use when encoding
-            floating point values.
-        force_ascii : bool, default True
-            Force encoded string to be ASCII.
-        date_unit : string, default 'ms' (milliseconds)
-            The time unit to encode to, governs timestamp and ISO8601
-            precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
-            microsecond, and nanosecond respectively.
-        default_handler : callable, default None
-            Handler to call if object cannot otherwise be converted to a
-            suitable format for JSON. Should receive a single argument which is
-            the object to convert and return a serialisable object.
-        lines : bool, default False
-            If 'orient' is 'records' write out line delimited json format. Will
-            throw ValueError if incorrect 'orient' since others are not list
-            like.
-        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
-            A string representing the compression to use in the output file,
-            only used when the first argument is a filename. By default, the
-            compression is inferred from the filename.
-        index : bool, default True
-            Whether to include the index values in the JSON string. Not
-            including the index (``index=False``) is only supported when
-            orient is 'split' or 'table'.
+        date_format : str, default None
+            Format string for datetime objects.
+        compression : {'gzip', 'bz2', 'xz', None}
+            A string representing the compression to use in the output file,
+            only used when `path` is specified. By default no compression
+            is applied ('uncompressed').
+        num_files : int, optional
+            The number of files to be written in the `path` directory when
+            `path` is specified.

         Examples
         --------
-
         >>> df = ks.DataFrame([['a', 'b'], ['c', 'd']],
-        ...                   index=['row 1', 'row 2'],
         ...                   columns=['col 1', 'col 2'])
-        >>> df.to_json(orient='split')
-        '{"columns":["col 1","col 2"],\
-"index":["row 1","row 2"],\
-"data":[["a","b"],["c","d"]]}'
-
-        >>> df['col 1'].to_json(orient='split')
-        '{"name":"col 1","index":["row 1","row 2"],"data":["a","c"]}'
-
-        Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
-        Note that index labels are not preserved with this encoding.
-
-        >>> df.to_json(orient='records')
+        >>> df.to_json()
         '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

-        >>> df['col 1'].to_json(orient='records')
-        '["a","c"]'
-
-        Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
-
-        >>> df.to_json(orient='index')
-        '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
-
-        >>> df['col 1'].to_json(orient='index')
-        '{"row 1":"a","row 2":"c"}'
-
-        Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:
-
-        >>> df.to_json(orient='columns')
-        '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}'
-
-        >>> df['col 1'].to_json(orient='columns')
-        '{"row 1":"a","row 2":"c"}'
-
-        Encoding/decoding a Dataframe using ``'values'`` formatted JSON:
-
-        >>> df.to_json(orient='values')
-        '[["a","b"],["c","d"]]'
-
-        >>> df['col 1'].to_json(orient='values')
-        '["a","c"]'
-
-        Encoding with Table Schema
-
-        >>> df.to_json(orient='table')  # doctest: +SKIP
-        '{"schema": {"fields":[{"name":"index","type":"string"},\
-{"name":"col 1","type":"string"},\
-{"name":"col 2","type":"string"}],\
-"primaryKey":["index"],\
-"pandas_version":"0.20.0"}, \
-"data": [{"index":"row 1","col 1":"a","col 2":"b"},\
-{"index":"row 2","col 1":"c","col 2":"d"}]}'
-
-        >>> df['col 1'].to_json(orient='table')  # doctest: +SKIP
-        '{"schema": {"fields":[{"name":"index","type":"string"},\
-{"name":"col 1","type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"}, \
-"data": [{"index":"row 1","col 1":"a"},{"index":"row 2","col 1":"c"}]}'
+        >>> df['col 1'].to_json()
+        '[{"col 1":"a"},{"col 1":"c"}]'
+
+        >>> df.to_json(path=r'%s/to_json/foo.json' % path, num_files=1)
+        >>> ks.read_json(
+        ...     path=r'%s/to_json/foo.json' % path
+        ... ).sort_values(by="col 1")
+          col 1 col 2
+        0     a     b
+        1     c     d
+
+        >>> df['col 1'].to_json(path=r'%s/to_json/foo.json' % path, num_files=1)
+        >>> ks.read_json(
+        ...     path=r'%s/to_json/foo.json' % path
+        ... ).sort_values(by="col 1")
+          col 1
+        0     a
+        1     c
         """
-        # Make sure locals() call is at the top of the function so we don't capture local variables.
-        args = locals()
+        if path is None:
+            # If path is None, just collect and use pandas' to_json.
+            kdf_or_ser = self
+            pdf = kdf_or_ser.to_pandas()
+            if isinstance(self, ks.Series):
+                pdf = pdf.to_frame()
+            # To make the format consistent and readable by `read_json`,
+            # convert it to pandas' and use the 'records' orient for now.
+            return pdf.to_json(orient='records')
+
         kdf = self
+        if isinstance(self, ks.Series):
+            kdf = self._kdf
+        sdf = kdf._sdf

-        if isinstance(self, ks.DataFrame):
-            f = pd.DataFrame.to_json
-        elif isinstance(self, ks.Series):
-            f = pd.Series.to_json
-        else:
-            raise TypeError('Constructor expects DataFrame or Series; however, '
-                            'got [%s]' % (self,))
+        if num_files is not None:
+            sdf = sdf.repartition(num_files)

-        return validate_arguments_and_invoke_function(
-            kdf._to_internal_pandas(), self.to_json, f, args)
+        builder = sdf.select(self._internal.data_columns).write.mode("overwrite")
+        OptionUtils._set_opts(builder, compression=compression)
+        builder.options(**kwargs).format("json").save(path)

     def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", float_format=None,
                  columns=None, header=True, index=True, index_label=None, startrow=0,
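For anyone trying this change locally, here is a minimal round-trip sketch of the new behaviour. It is not part of the commit: it assumes this branch of Koalas is installed, a local Spark session is available, and a writable temporary directory; the `'gzip'` value is one of the compression codes listed in the new docstring.

```python
# Round-trip sketch for the new to_json (assumes a local Spark session
# and a writable temporary directory; the output path is a placeholder).
import tempfile
from databricks import koalas as ks

kdf = ks.DataFrame({'col 1': ['a', 'c'], 'col 2': ['b', 'd']})

# No path: data is collected to the driver and returned as a JSON string,
# always in pandas' 'records' orient.
print(kdf.to_json())
# '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

# With a path: Spark writes a directory of part-... files. num_files=1
# repartitions to a single partition, so exactly one part file is produced.
out = '%s/to_json/foo.json' % tempfile.mkdtemp()
kdf.to_json(path=out, num_files=1, compression='gzip')
print(ks.read_json(out).sort_values(by='col 1'))
```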
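The writer path of the new body is, roughly, the plain PySpark below. This is only an illustrative sketch of what `to_json(path=..., num_files=1)` assembles, with the `OptionUtils._set_opts` call replaced by an explicit `.option(...)`; `_set_opts` exists to skip options whose value is `None` rather than forward them to Spark.

```python
# Roughly what to_json(path=..., num_files=1) assembles under the hood
# (an illustrative sketch, not the committed code).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([('a', 'b'), ('c', 'd')], ['col 1', 'col 2'])

(sdf.repartition(1)                       # num_files=1
    .select('col 1', 'col 2')             # data columns only; the index is dropped
    .write.mode('overwrite')              # matches the hard-coded overwrite mode
    .option('compression', 'uncompressed')
    .format('json')
    .save('/tmp/to_json/foo.json'))       # placeholder output directory
```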