Comparing changes

base repository: databricks/koalas, base: v1.5.0
head repository: databricks/koalas, compare: master

Commits on Dec 14, 2020

  1. Use OpenJDK instead of OracleJDK in Binder (#1969)

    This PR proposes to use OpenJDK instead. The current Oracle JDK download with `wget` is broken:
    
        ```
        --2020-12-14 08:16:14--  http://download.oracle.com/otn-pub/java/jdk/8u131-b11/d54c1d3a095b4ff2b6607d096fa80163/jdk-8u131-linux-x64.tar.gz
        Resolving download.oracle.com (download.oracle.com)... 184.50.116.99
        Connecting to download.oracle.com (download.oracle.com)|184.50.116.99|:80... connected.
        HTTP request sent, awaiting response... 302 Moved Temporarily
        Location: https://edelivery.oracle.com/otn-pub/java/jdk/8u131-b11/d54c1d3a095b4ff2b6607d096fa80163/jdk-8u131-linux-x64.tar.gz [following]
        --2020-12-14 08:16:14--  https://edelivery.oracle.com/otn-pub/java/jdk/8u131-b11/d54c1d3a095b4ff2b6607d096fa80163/jdk-8u131-linux-x64.tar.gz
        Resolving edelivery.oracle.com (edelivery.oracle.com)... 2.17.191.76, 2a02:26f0:1700:58b::366, 2a02:26f0:1700:591::366
        Connecting to edelivery.oracle.com (edelivery.oracle.com)|2.17.191.76|:443... connected.
        HTTP request sent, awaiting response... 403 Forbidden
        2020-12-14 08:20:14 ERROR 403: Forbidden.
    
        tar (child): jdk-8u131-linux-x64.tar.gz: Cannot open: No such file or directory
        tar (child): Error is not recoverable: exiting now
        tar: Child returned status 2
        tar: Error is not recoverable: exiting now
        ./postBuild: line 9: cd: jdk1.8.0_131: No such file or directory
        ./postBuild: line 11: cd: bin: No such file or directory
        ```
    
    This was tested in https://mybinder.org/v2/gh/hyukjinkwon/koalas/fix-binder?filepath=docs%2Fsource%2Fgetting_started%2F10min.ipynb
    
    Apache Spark uses the same approach as well.
    HyukjinKwon authored Dec 14, 2020 (commit d1babcc)

Commits on Dec 16, 2020

  1. Fix DataFrame.replace with NaN/None values #1907 (#1962)

    This PR closes #1907.
    LucasG0 authored Dec 16, 2020 (commit fab671c)
  2. Fix stat functions with no numeric columns. (#1967)

    Some statistical functions fail if there are no numeric columns.
    
    ```py
    >>> kdf = ks.DataFrame({"A": pd.date_range("2020-01-01", periods=3), "B": pd.date_range("2021-01-01", periods=3)})
    >>> kdf.mean()
    Traceback (most recent call last):
    ...
    ValueError: Current DataFrame has more then the given limit 1 rows. Please set 'compute.max_rows' by using 'databricks.koalas.config.set_option' to retrieve to retrieve more than 1 rows. Note that, before changing the 'compute.max_rows', this operation is considerably expensive.
    ```
    
    The functions which allow non-numeric columns by default are:
    
    - `count`
    - `min`
    - `max`
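
    For reference, pandas drops such non-numeric ("nuisance") columns in these reductions and returns an empty result instead of raising. A minimal pandas comparison (pandas 1.x behavior, shown only for context):

    ```py
    >>> import pandas as pd
    >>> pdf = pd.DataFrame({"A": pd.date_range("2020-01-01", periods=3)})
    >>> pdf.mean()
    Series([], dtype: float64)
    ```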
    ueshin authored Dec 16, 2020 (commit bd73c30)
  3. Simplify plot backend support (#1970)

    This PR proposes to simplify the plot implementation. The current Koalas implementation attempts to map arguments between plotting backends (e.g., matplotlib vs. plotly). Keeping this map is a huge maintenance cost, and it is unrealistic to track backend changes and keep updating it.

    pandas itself does not keep such a map either:
    
    ```python
    >>> import pandas as pd
    >>> pd.DataFrame([1,2,3]).plot.line(logx=1)
    <AxesSubplot:>
    >>> pd.options.plotting.backend = "plotly"
    >>> pd.DataFrame([1,2,3]).plot.line(logx=1)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/.../opt/miniconda3/envs/python3.8/lib/python3.8/site-packages/pandas/plotting/_core.py", line 1017, in line
        return self(kind="line", x=x, y=y, **kwargs)
      File "/.../opt/miniconda3/envs/python3.8/lib/python3.8/site-packages/pandas/plotting/_core.py", line 879, in __call__
        return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs)
      File "/.../miniconda3/envs/python3.8/lib/python3.8/site-packages/plotly/__init__.py", line 102, in plot
        return line(data_frame, **kwargs)
    TypeError: line() got an unexpected keyword argument 'logx'
    ```
    HyukjinKwon authored Dec 16, 2020 (commit 37f7e50)

Commits on Dec 18, 2020

  1. Implement (DataFrame|Series).plot.pie in plotly (#1971)

    This PR implements `DataFrame.plot.pie` in plotly as below:
    
    ```python
    from databricks import koalas as ks
    kdf = ks.DataFrame(
        {'a': [1, 2, 3, 4, 5, 6],
         'b': [100, 200, 300, 400, 500, 600]},
        index=[10, 20, 30, 40, 50, 60])
    ks.options.plotting.backend = 'plotly'
    kdf.plot.pie(y="b")
    ```
    
    ![Screen Shot 2020-12-17 at 3 28 12 PM](https://user-images.githubusercontent.com/6477701/102451779-87005380-407c-11eb-85f3-aa2d8e62c991.png)
    
    Binder to test: https://mybinder.org/v2/gh/HyukjinKwon/koalas/plotly-pie?filepath=docs%2Fsource%2Fgetting_started%2F10min.ipynb
    HyukjinKwon authored Dec 18, 2020 (commit b81afcc)

Commits on Dec 21, 2020

  1. Refine Frame._reduce_for_stat_function. (#1975)

    Refines `DataFrame/Series._reduce_for_stat_function` to avoid special-casing individual functions.
    
    Also:
    - Consolidates the implementations of `count` and supports the `numeric_only` parameter.
    - Adds argument type annotations.
    ueshin authored Dec 21, 2020 (commit c973195)

Commits on Dec 22, 2020

  1. Add min_count parameter for Frame.sum. (#1978)

    Adds a `min_count` parameter to `Frame.sum`.
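
    For context, `min_count` follows the pandas semantics: if fewer than `min_count` valid (non-null) values are present, the result is NaN. A minimal pandas illustration (not output from this PR):

    ```python
    >>> import pandas as pd
    >>> pd.Series([1.0, None]).sum(min_count=2)
    nan
    >>> pd.Series([1.0, None]).sum(min_count=1)
    1.0
    ```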
    ueshin authored Dec 22, 2020 (commit dd9661f)

Commits on Dec 23, 2020

  1. Fix cumsum and cumprod. (#1982)

    Fixes `DataFrame/Series/GroupBy.cumsum` and `cumprod`.
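
    For reference, the pandas semantics these functions target (illustrative pandas output, not taken from this PR):

    ```python
    >>> import pandas as pd
    >>> pd.Series([1, 2, 3]).cumsum()
    0    1
    1    3
    2    6
    dtype: int64
    >>> pd.Series([1, 2, 3]).cumprod()
    0    1
    1    2
    2    6
    dtype: int64
    ```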
    ueshin authored Dec 23, 2020 (commit f9465aa)
  2. Fix Frame.abs to support bool type and disallow non-numeric types. (#1980)

    Fixes `Frame.abs` to support bool type and disallow non-numeric types.
    ueshin authored Dec 23, 2020 (commit 4c86f3c)
  3. Refine DataFrame/Series.product. (#1979)

    Refines `DataFrame/Series.product` to:
    
    - Consolidate and reuse `_reduce_for_stat_function`.
    - Support `axis`, `numeric_only`, and `min_count` parameters.
    - Correctly compute products over values that include negatives or zeros (see the sketch below).
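
    Since Spark has no built-in product aggregation, a common way to compute one is via log-sums, tracking zeros and the sign separately. A minimal Python sketch of that technique (an illustration of the idea only, not necessarily the actual Koalas implementation):

    ```python
    import math

    def product(values):
        # Compute the product as exp(sum(log|x|)), handling zeros and the
        # overall sign separately so negatives and zeros are supported.
        if any(v == 0 for v in values):
            return 0.0
        sign = -1.0 if sum(v < 0 for v in values) % 2 else 1.0
        return sign * math.exp(sum(math.log(abs(v)) for v in values))
    ```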
    ueshin authored Dec 23, 2020 (commit ccc637b)
  4. Fix the build error. (#1984)

    The build failed due to conflicts between recent PRs.
    
    ```
    flake8 checks failed:
    ./databricks/koalas/series.py:5741:45: F821 undefined name 'BooleanType'
            if isinstance(kser.spark.data_type, BooleanType):
                                                ^
    ./databricks/koalas/series.py:5750:45: F821 undefined name 'BooleanType'
            if isinstance(self.spark.data_type, BooleanType):
                                                ^
    ```
    ueshin authored Dec 23, 2020 (commit 0d3d216)
  5. Refine DataFrame/Series.quantile. (#1977)

    Refines `DataFrame/Series.quantile` to:
    
    - Reuse `_reduce_for_stat_function` when `q` is `float`.
    - Consolidate the logic when `q` is `Iterable`.
    
    Also supports `numeric_only` for `DataFrame`.
    ueshin authored Dec 23, 2020 (commit 5c44ecc)

Commits on Dec 24, 2020

  1. Support ddof parameter for std and var. (#1986)

    Supports the `ddof` parameter for `Frame.std` and `Frame.var`.
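
    For context, `ddof` (delta degrees of freedom) changes the divisor of the variance to `N - ddof`: `ddof=1` gives the sample variance and `ddof=0` the population variance. A minimal sketch of the formula:

    ```python
    def variance(values, ddof=1):
        # The divisor is N - ddof: ddof=1 -> sample, ddof=0 -> population.
        n = len(values)
        mean = sum(values) / n
        return sum((v - mean) ** 2 for v in values) / (n - ddof)
    ```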
    ueshin authored Dec 24, 2020 (commit 5f27857)

Commits on Dec 26, 2020

  1. Use Python type name instead of Spark's in error messages. (#1985)

    Addresses #1980 (comment) by adding pandas dtypes.
    ueshin authored Dec 26, 2020 (commit 1e51477)

Commits on Dec 28, 2020

  1. Fix wrong condition for almostequals (#1988)

    Fixed several wrong conditions in `if` statements for `assertPandasAlmostEqual`.
    itholic authored Dec 28, 2020 (commit 6796aa4)

Commits on Dec 29, 2020

  1. Support setattr for DataFrame. (#1989)

    Supports setting attributes on a DataFrame.
    
    ```py
    >>> kdf = ks.DataFrame({'A': [1, 2, 3, None]})
    >>> kdf.A = kdf.A.fillna(kdf.A.median())
    >>> kdf
         A
    0  1.0
    1  2.0
    2  3.0
    3  2.0
    ```
    ueshin authored Dec 29, 2020 (commit 9a9c178)

Commits on Dec 30, 2020

  1. Use object.__setattr__ in Series. (#1991)

    This is a follow-up of #1989.
    There were some more places where attributes were being set through the overridden `DataFrame.__setattr__`.
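
    A generic sketch of the pattern (not the exact Koalas code; `_set_column` is a hypothetical helper): once `__setattr__` is overridden to route attribute assignment to column assignment, internal bookkeeping attributes must be set with `object.__setattr__` to bypass the override.

    ```python
    class Frame:
        def __setattr__(self, name, value):
            # Attribute assignment is routed to column assignment,
            # as DataFrame.__setattr__ does after #1989.
            self._set_column(name, value)  # hypothetical helper

    frame = Frame.__new__(Frame)
    # Internal state must bypass the override, or it would be
    # treated as a column assignment:
    object.__setattr__(frame, "_internal_state", {})
    ```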
    ueshin authored Dec 30, 2020 (commit 0e44bc7)

Commits on Jan 5, 2021

  1. Add note about missing mixed type support to docs (#1990)

    Added a note to the documentation about the missing support for mixed types.
    
    ![Screen Shot 2021-01-05 at 9 36 39 AM](https://user-images.githubusercontent.com/44108233/103610184-d7bfe980-4f62-11eb-9cbb-744623bcbc4d.png)
    
    
    Resolves #1981
    itholic authored Jan 5, 2021 (commit ae5c8d8)

Commits on Jan 6, 2021

  1. Implemented sem() for Series and DataFrame (#1993)

    This PR proposes `Series.sem()` and `DataFrame.sem()`:
    
    ```python
    >>> kdf = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    >>> kdf
       a  b
    0  1  4
    1  2  5
    2  3  6
    
    >>> kdf.sem()
    a    0.57735
    b    0.57735
    dtype: float64
    
    >>> kdf.sem(ddof=0)
    a    0.471405
    b    0.471405
    dtype: float64
    
    >>> kdf.sem(axis=1)
    0    1.5
    1    1.5
    2    1.5
    dtype: float64
    
    # Support for Series
    
    >>> kser = kdf.a
    >>> kser
    0    1
    1    2
    2    3
    Name: a, dtype: int64
    
    >>> kser.sem()
    0.5773502691896258
    
    >>> kser.sem(ddof=0)
    0.47140452079103173
    ```
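
    For reference, `sem` is the standard deviation divided by √N; here `std([1, 2, 3], ddof=1)` is 1.0 and N is 3, hence:

    ```python
    >>> import math
    >>> 1.0 / math.sqrt(3)
    0.5773502691896258
    ```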
    itholic authored Jan 6, 2021 (commit 29deaf1)

Commits on Jan 7, 2021

  1. Added ddof parameter for GroupBy.std() and GroupBy.var() (#1994)

    Added missing parameter `ddof` for `GroupBy.std()` and `GroupBy.var()`.
    
    ```python
    >>> kdf = ks.DataFrame(
    ...     {
    ...         "a": [1, 2, 6, 4, 4, 6, 4, 3, 7],
    ...         "b": [4, 2, 7, 3, 3, 1, 1, 1, 2],
    ...         "c": [4, 2, 7, 3, None, 1, 1, 1, 2],
    ...         "d": list("abcdefght"),
    ...     },
    ...     index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
    ... )
    >>> kdf
       a  b    c  d
    0  1  4  4.0  a
    1  2  2  2.0  b
    3  6  7  7.0  c
    5  4  3  3.0  d
    6  4  3  NaN  e
    8  6  1  1.0  f
    9  4  1  1.0  g
    9  3  1  1.0  h
    9  7  2  2.0  t
    
    # std
    >>> kdf.groupby("a").std(ddof=1)
              b         c
    a
    7       NaN       NaN
    6  4.242641  4.242641
    1       NaN       NaN
    3       NaN       NaN
    2       NaN       NaN
    4  1.154701  1.414214
    
    >>> kdf.groupby("a").std(ddof=0)
              b    c
    a
    7  0.000000  0.0
    6  3.000000  3.0
    1  0.000000  0.0
    3  0.000000  0.0
    2  0.000000  0.0
    4  0.942809  1.0
    
    # var
    >>> kdf.groupby("a").var(ddof=1)
               b     c
    a
    7        NaN   NaN
    6  18.000000  18.0
    1        NaN   NaN
    3        NaN   NaN
    2        NaN   NaN
    4   1.333333   2.0
    
    >>> kdf.groupby("a").var(ddof=0)
              b    c
    a
    7  0.000000  0.0
    6  9.000000  9.0
    1  0.000000  0.0
    3  0.000000  0.0
    2  0.000000  0.0
    4  0.888889  1.0
    ```
    itholic authored Jan 7, 2021 (commit f7afe12)
  2. Adjust Series.mode to match pandas Series.mode (#1995)

    Currently, Series.mode preserves the name of the Series in the result, whereas pandas Series.mode doesn't:
    ```
    >>> kser1
    x    1
    y    2
    Name: z, dtype: int64
    >>> kser1.mode()
    0    1
    1    2
    Name: z, dtype: int64  # name preserved
    >>> pser1 = kser1.to_pandas()
    >>> pser1.mode()
    0    1
    1    2
    dtype: int64  # name not preserved
    ```
    
    In addition, unit tests are added.
    xinrong-meng authored Jan 7, 2021 (commit ddbdc9a)

Commits on Jan 10, 2021

  1. Commit 66bda08

Commits on Jan 12, 2021

  1. Optimize histogram calculation as a single pass (#1997)

    This PR optimizes the histogram plot in Koalas by unioning the transformed results so that the computation happens in a single pass.

    Previously, when `DataFrame.plot.hist` was called, each column triggered a separate job.
    Now it is done in a single pass, even for `DataFrame`s.

    I also verified that the results are still the same:
    
    ![Screen Shot 2021-01-08 at 1 15 47 PM](https://user-images.githubusercontent.com/6477701/103973784-a68a2800-51b3-11eb-86ba-90141346434d.png)
    
    ![Screen Shot 2021-01-08 at 1 16 19 PM](https://user-images.githubusercontent.com/6477701/103973813-b99cf800-51b3-11eb-8dc9-4d6a6cc26e3c.png)
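
    A hedged sketch of the single-pass idea in PySpark (illustrative only, not the actual Koalas code): tag each column's values with a group id, union them, and compute all the counts with one aggregation.

    ```python
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1, 10), (2, 20), (1, 20)], ["a", "b"])

    # Tag each column's values with a group id and union them so a single
    # aggregation covers every column (bucketing values into histogram
    # bins is omitted here for brevity).
    unioned = None
    for i, col in enumerate(sdf.columns):
        tagged = sdf.select(F.lit(i).alias("__group_id"), F.col(col).alias("__value"))
        unioned = tagged if unioned is None else unioned.union(tagged)

    counts = unioned.groupBy("__group_id", "__value").count()
    ```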
    HyukjinKwon authored Jan 12, 2021 (commit f5f88bd)
  2. Refactor and extract hist calculation logic from matplotlib (#1998)

    This PR extracts the histogram calculation logic from `matplotlib.py` to `core.py`.
    It depends on #1997.
    HyukjinKwon authored Jan 12, 2021 (commit ea6ad98)
  3. Support operations between Series and Index. (#1996)

    Supports operations between `Series` and `Index`.
    
    ```py
    >>> kser = ks.Series([1, 2, 3, 4, 5, 6, 7])
    >>> kidx = ks.Index([0, 1, 2, 3, 4, 5, 6])
    
    >>> (kser + 1 + 10 * kidx).sort_index()
    0     2
    1    13
    2    24
    3    35
    4    46
    5    57
    6    68
    dtype: int64
    >>> (kidx + 1 + 10 * kser).sort_index()
    0    11
    1    22
    2    33
    3    44
    4    55
    5    66
    6    77
    dtype: int64
    ```
    ueshin authored Jan 12, 2021 (commit 8d4157d)

Commits on Jan 13, 2021

  1. Implement (DataFrame|Series).plot.hist in plotly (#1999)

    This PR implements `(DataFrame|Series).plot.hist` in plotly:
    
    This can be tested via: https://mybinder.org/v2/gh/HyukjinKwon/koalas/plotly-histogram?filepath=docs%2Fsource%2Fgetting_started%2F10min.ipynb
    
    Example:
    
    ```python
    # Koalas
    import databricks.koalas as ks
    ks.options.plotting.backend = "plotly"
    kdf = ks.DataFrame({
        'a c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],
        'b': [2, 3, 4, 5, 7, 9, 10, 15, 10, 20, 20]
    })
    (kdf + 100).plot.hist()
    
    # pandas
    import pandas as pd
    pd.options.plotting.backend = "plotly"
    pdf = pd.DataFrame({
        'a c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],
        'b': [2, 3, 4, 5, 7, 9, 10, 15, 10, 20, 20]
    })
    (pdf + 100).plot.hist()
    ```
    
    ![Screen Shot 2021-01-12 at 10 12 47 PM](https://user-images.githubusercontent.com/6477701/104318885-644e4700-5523-11eb-81bc-f56ea1dbe797.png)
    
    ![Screen Shot 2021-01-12 at 10 12 52 PM](https://user-images.githubusercontent.com/6477701/104318888-657f7400-5523-11eb-8d06-da40206b4d01.png)
    
    Note that the output is a bit different because:
    - We use Spark for the histogram calculation, which differs slightly from pandas'.
    - Plotly's histogram plot cannot be used directly in our case; we work around it with bar charts, since we do not use plotly to compute the histogram (see https://plotly.com/python/histograms/).
    HyukjinKwon authored Jan 13, 2021 (commit 3ce2d87)
  2. Adjust data when all the values in a column are nulls. (#2004)

    For Spark < 3.0, when all the values in a column are null, every value will be `None` regardless of the column's data type.
    
    ```py
    >>> pdf = pd.DataFrame(
    ...             {
    ...                 "a": [None, None, None, "a"],
    ...                 "b": [None, None, None, 1],
    ...                 "c": [None, None, None] + list(np.arange(1, 2).astype("i1")),
    ...                 "d": [None, None, None, 1.0],
    ...                 "e": [None, None, None, True],
    ...                 "f": [None, None, None] + list(pd.date_range("20130101", periods=1)),
    ...             },
    ...         )
    >>>
    >>> kdf = ks.from_pandas(pdf)
    >>> kdf.iloc[:-1]
          a     b     c     d     e     f
    0  None  None  None  None  None  None
    1  None  None  None  None  None  None
    2  None  None  None  None  None  None
    ```
    
    whereas for pandas:
    
    ```py
    >>> pdf.iloc[:-1]
          a   b   c   d     e   f
    0  None NaN NaN NaN  None NaT
    1  None NaN NaN NaN  None NaT
    2  None NaN NaN NaN  None NaT
    ```
    
    With Spark >= 3.0 it seems fine:
    
    ```py
    >>> kdf.iloc[:-1]
          a   b   c   d     e   f
    0  None NaN NaN NaN  None NaT
    1  None NaN NaN NaN  None NaT
    2  None NaN NaN NaN  None NaT
    ```
    ueshin authored Jan 13, 2021 (commit 3cde582)
  3. Implement Series.factorize() (#1972)

    ref #1929
    ```
    >>> kser = ks.Series(['b', None, 'a', 'c', 'b'])
    >>> codes, uniques = kser.factorize()
    >>> codes
    0    1
    1   -1
    2    0
    3    2
    4    1
    dtype: int64
    >>> uniques
    Index(['a', 'b', 'c'], dtype='object')

    >>> codes, uniques = kser.factorize(na_sentinel=None)
    >>> codes
    0    1
    1    3
    2    0
    3    2
    4    1
    dtype: int64
    >>> uniques
    Index(['a', 'b', 'c', None], dtype='object')

    >>> codes, uniques = kser.factorize(na_sentinel=-2)
    >>> codes
    0    1
    1   -2
    2    0
    3    2
    4    1
    dtype: int64
    >>> uniques
    Index(['a', 'b', 'c'], dtype='object')
    ```
    xinrong-meng authored Jan 13, 2021 (commit ce2d260)

Commits on Jan 14, 2021

  1. Refactor to use one similar logic to call plot backends (#2005)

    This PR proposes:
    - Remove `koalas_plotting_backends`. We don't currently have a mechanism like pandas-dev/pandas@e9a60bb.
    - Load and use plotting backends in the same way:
      - If a plot module has `plot`, we use it after converting the Koalas instance to pandas.
      - If a plot module has `plot_koalas` (Koalas' `matplotlib` and `plotly` modules, for example), we just pass Koalas instances to it.
    
    Now `databricks.koalas.plot.plotly` and `databricks.koalas.plot.matplotlib` modules work like external plotting backends.
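
    A minimal sketch of that dispatch (function and attribute names other than `plot_koalas` and `plot` are assumptions for illustration):

    ```python
    import importlib

    def call_plot_backend(data, backend_name, **kwargs):
        module = importlib.import_module(backend_name)
        if hasattr(module, "plot_koalas"):
            # The backend understands Koalas objects directly.
            return module.plot_koalas(data, **kwargs)
        # A generic pandas plotting backend: convert first.
        return module.plot(data.to_pandas(), **kwargs)
    ```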
    HyukjinKwon authored Jan 14, 2021 (commit e77ee69)
  2. Extract box computing logic from matplotlib (#2006)

    This PR moves the core computation logic from the matplotlib module to the core module in plot.
    HyukjinKwon authored Jan 14, 2021 (commit 690a4f2)

Commits on Jan 15, 2021

  1. Fix build error. (#2008)

    Set the upper bound for `nbformat`.
    ueshin authored Jan 15, 2021 (commit d0f4ad2)
  2. Implement Series.plot.box (#2007)

    This PR implements Series.plot.box with plotly. Note that DataFrame.plot.box is not supported in Koalas yet.
    This can be tested via the link: https://mybinder.org/v2/gh/HyukjinKwon/koalas/plot-box-ser?filepath=docs%2Fsource%2Fgetting_started%2F10min.ipynb
    
    Note that you should manually install plotly to test with mybinder above:
    
    ```
    %%bash
    pip install plotly
    ```
    
    Example:
    
    ```python
    # Koalas
    from databricks import koalas as ks
    ks.options.plotting.backend = "plotly"
    kdf = ks.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],}, index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10])
    kdf.a.plot.box()
    
    # pandas
    import pandas as pd
    pd.options.plotting.backend = "plotly"
    pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],}, index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10])
    pdf.a.plot.box()
    ```
    
    ![Screen Shot 2021-01-14 at 6 56 19 PM](https://user-images.githubusercontent.com/6477701/104575700-acdc4080-569a-11eb-8d55-0ac3db800ddd.png)
    ![Screen Shot 2021-01-14 at 6 56 24 PM](https://user-images.githubusercontent.com/6477701/104575705-ad74d700-569a-11eb-9b7c-a37e04f77ec7.png)
    
    For the same reason as #1999, the output is slightly different from pandas'.
    I referred to "Box Plot With Precomputed Quartiles" in https://plotly.com/python/box-plots/.
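
    A minimal standalone sketch of that approach, with illustrative precomputed statistics (not values computed by Koalas):

    ```python
    import plotly.graph_objects as go

    fig = go.Figure()
    # Build the box from precomputed statistics instead of raw data.
    fig.add_trace(go.Box(
        name="a",
        q1=[3.5], median=[6.0], q3=[9.5],
        lowerfence=[1.0], upperfence=[15.0],
        boxpoints=False,
    ))
    fig.show()
    ```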
    HyukjinKwon authored Jan 15, 2021 (commit 26e5a2d)

Commits on Jan 17, 2021

  1. Extract kde computing logic from matplotlib (#2010)

    This PR moves the core KDE computation logic from the matplotlib module to the core plot module.
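
    For context, the computation being moved evaluates a Gaussian kernel density estimate; a generic sketch of that math (an illustration of the technique, not the Koalas code):

    ```python
    import math

    def gaussian_kde(sample, points, bandwidth):
        # density(x) = (1 / (n * h * sqrt(2*pi))) * sum_s exp(-(x - s)^2 / (2*h^2))
        n = len(sample)
        norm = bandwidth * math.sqrt(2 * math.pi)
        return [
            sum(math.exp(-((x - s) ** 2) / (2 * bandwidth ** 2)) for s in sample)
            / (n * norm)
            for x in points
        ]
    ```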
    HyukjinKwon authored Jan 17, 2021 (commit 74b4892)

Commits on Jan 19, 2021

  1. Fix as_spark_type to not support "bigint". (#2011)

    Fix `as_spark_type` to not support "bigint".
    
    The string "bigint" is not recognizable by `np.dtype` and it causes an unexpected error:
    
    ```py
    >>> import numpy as np
    >>> from databricks.koalas.typedef import as_spark_type
    >>> as_spark_type(np.dtype("datetime64[ns]"))
    Traceback (most recent call last):
    ...
    TypeError: data type "bigint" not understood
    ```
    
    Also, it doesn't work in pandas:
    
    ```py
    >>> pd.Series([1, 2, 3], dtype="bigint")
    Traceback (most recent call last):
    ...
    TypeError: data type "bigint" not understood
    ```
    ueshin authored Jan 19, 2021 (commit 0e39097)

Commits on Jan 20, 2021

  1. Reuse as_spark_type in infer_pd_series_spark_type. (#2012)

    Now that `as_spark_type` is good enough for Koalas, we should reuse it in `infer_pd_series_spark_type` to avoid inconsistency.
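
    A simplified sketch of the consolidation (the real `infer_pd_series_spark_type` does more, such as inferring types for object dtypes; `pser` stands for an assumed pandas Series):

    ```python
    from databricks.koalas.typedef import as_spark_type

    def infer_pd_series_spark_type(pser):
        # Delegate the dtype-to-Spark-type mapping to as_spark_type
        # instead of duplicating the logic.
        return as_spark_type(pser.dtype)
    ```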
    ueshin authored Jan 20, 2021 (commit c38c96f)
  2. Implement DataFrame.insert (#1983)

    ref #1929
    
    Insert column into DataFrame at a specified location.
    
    ```
    >>> kdf = ks.DataFrame([1, 2, 3])
    >>> kdf.insert(0, 'x', 4)
    >>> kdf.sort_index()
       x  0
    0  4  1
    1  4  2
    2  4  3

    >>> from databricks.koalas.config import set_option, reset_option
    >>> set_option("compute.ops_on_diff_frames", True)

    >>> kdf.insert(1, 'y', [5, 6, 7])
    >>> kdf.sort_index()
       x  y  0
    0  4  5  1
    1  4  6  2
    2  4  7  3

    >>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
    >>> kdf.sort_index()
       x  y   z  0
    0  4  5   8  1
    1  4  6   9  2
    2  4  7  10  3

    >>> reset_option("compute.ops_on_diff_frames")
    ```
    xinrong-meng authored Jan 20, 2021 (commit 8803344)

Commits on Jan 22, 2021

  1. Set upperbound for pandas 1.2.0 (#2016)

    Sets an upper bound to exclude pandas 1.2.0 until we fully support it.

    Refer to #1987.
    itholic authored Jan 22, 2021 (commit 1b87f30)
  2. Bump up version to 1.6.0

    HyukjinKwon committed Jan 22, 2021 (commit 9232f29)

Commits on Jan 26, 2021

  1. Commit 118384a

Commits on Jan 27, 2021

  1. Commit 885fdfc

Commits on Jan 28, 2021

  1. Implemented ks.read_orc (#2017)

    This PR proposes `ks.read_orc` to support creating a `DataFrame` from ORC files.
    
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_orc.html#pandas.read_orc
    
    ```python
    >>> ks.read_orc("example.orc")
       i32  i64    f  bhello
    0    0    0  0.0  people
    
    >>> pd.read_orc("example.orc")
       i32  i64    f  bhello
    0    0    0  0.0  people
    
    # with columns
    >>> ks.read_orc("example.orc", columns=["i32", "f"])
       i32    f
    0    0  0.0
    
    >>> pd.read_orc("example.orc", columns=["i32", "f"])
       i32    f
    0    0  0.0
    >>>
    ```
    itholic authored Jan 28, 2021 (commit b8e2924)

Commits on Jan 29, 2021

  1. Implemented DataFrame.to_orc (#2024)

    This PR proposes `DataFrame.to_orc` to write the ORC file.
    
    pandas doesn't support this, but we provide it for convenience.
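
    A hedged usage sketch (the path is illustrative):

    ```python
    import databricks.koalas as ks

    kdf = ks.DataFrame({"i32": [0], "f": [0.0]})
    kdf.to_orc("/tmp/example_output.orc")
    ```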
    itholic authored Jan 29, 2021 (commit 6c254f7)

Commits on Feb 1, 2021

  1. [HOTFIX] set upperbounds numpy to fix CI failure (#2027)

    This PR quickly fixes the `mypy` test failure caused by the NumPy 1.20.0 release, to unblock other PRs.

    This upper bound should be removed again once #2026 is finished and merged.
    itholic authored Feb 1, 2021 (commit c5dbc9b)
  2. Commit 060fee3

Commits on Feb 2, 2021

  1. Change matplotlib as an optional dependency (#2029)

    This PR proposes to make matplotlib an optional dependency.
    
    ```python
    >>> from databricks import koalas as ks
    >>> ks.range(100).plot.bar()
    Traceback (most recent call last):
      ...
    ImportError: matplotlib is required for plotting when the default backend 'matplotlib' is selected.
    ```
    
    Resolves #issues
    HyukjinKwon authored Feb 2, 2021 (commit 5e28195)
  2. Add Int64Index, Float64Index, DatetimeIndex. (#2025)

    Adds `Int64Index`, `Float64Index`, and `DatetimeIndex` as a placeholder.
    We should still add specific attributes and methods in follow-up PRs.
    
    Before:
    
    ```py
    >>> kdf = ks.DataFrame([1,2,3])
    >>> type(kdf.index)
    <class 'databricks.koalas.indexes.Index'>
    ```
    
    After:
    
    ```py
    >>> type(kdf.index)
    <class 'databricks.koalas.indexes.numeric.Int64Index'>
    ```
    ueshin authored Feb 2, 2021 (commit c023792)
  3. Use NullType for empty or null dataset. (#2013)

    Experimental.
    ueshin authored Feb 2, 2021 (commit 5dbc5ec)
  4. Remove pypandoc hack (#2034)

    JessicaTegner/pypandoc#154 is merged and released, so we can remove the hack for file name handling.
    HyukjinKwon authored Feb 2, 2021 (commit aef4f48)
  5. Commit 88212f3
  6. Preserve index for statistical functions with axis==1. (#2036)

    Preserves `index` for statistical functions with `axis==1`.
    
    ```py
    >>> kdf = ks.DataFrame(
    ...     {
    ...         "A": [1, -2, 3, -4, 5],
    ...         "B": [1.0, -2, 3, -4, 5],
    ...         "C": [-6.0, -7, -8, -9, 10],
    ...         "D": [True, False, True, False, False],
    ...     },
    ...     index=[10, 20, 30, 40, 50]
    ... )
    >>> kdf.count(axis=1)
    10    4
    20    4
    30    4
    40    4
    50    4
    dtype: int64
    ```
    
    whereas, with a small `compute.shortcut_limit`, the index was not preserved:
    
    ```py
    >>> ks.set_option("compute.shortcut_limit", 2)
    >>> kdf.count(axis=1)
    0    4
    1    4
    2    4
    3    4
    4    4
    dtype: int64
    ```
    
    After:
    
    ```py
    >>> ks.set_option("compute.shortcut_limit", 2)
    >>> kdf.count(axis=1)
    10    4
    20    4
    30    4
    40    4
    50    4
    dtype: int64
    ```
    ueshin authored Feb 2, 2021 (commit 96f04aa)
Showing 88 changed files with 24,681 additions and 4,228 deletions.
  1. +49 −16 .github/workflows/master.yml
  2. +3 −1 README.md
  3. +1 −0 apt.txt
  4. +0 −7 databricks/conftest.py
  5. +58 −3 databricks/koalas/__init__.py
  6. +114 −35 databricks/koalas/accessors.py
  7. +536 −67 databricks/koalas/base.py
  8. +163 −0 databricks/koalas/categorical.py
  9. +6 −6 databricks/koalas/config.py
  10. +4 −4 databricks/koalas/extensions.py
  11. +1,033 −374 databricks/koalas/frame.py
  12. +641 −148 databricks/koalas/generic.py
  13. +119 −87 databricks/koalas/groupby.py
  14. +19 −0 databricks/koalas/indexes/__init__.py
  15. +205 −1,181 databricks/koalas/{indexes.py → indexes/base.py}
  16. +187 −0 databricks/koalas/indexes/category.py
  17. +741 −0 databricks/koalas/indexes/datetimes.py
  18. +1,169 −0 databricks/koalas/indexes/multi.py
  19. +146 −0 databricks/koalas/indexes/numeric.py
  20. +192 −40 databricks/koalas/indexing.py
  21. +415 −80 databricks/koalas/internal.py
  22. +0 −7 databricks/koalas/missing/frame.py
  23. +49 −5 databricks/koalas/missing/indexes.py
  24. +0 −7 databricks/koalas/missing/series.py
  25. +1 −1 databricks/koalas/mlflow.py
  26. +249 −6 databricks/koalas/namespace.py
  27. +0 −1 databricks/koalas/plot/__init__.py
  28. +520 −323 databricks/koalas/plot/core.py
  29. +61 −233 databricks/koalas/plot/matplotlib.py
  30. +217 −0 databricks/koalas/plot/plotly.py
  31. +504 −255 databricks/koalas/series.py
  32. +50 −1 databricks/koalas/spark/accessors.py
  33. +8 −4 databricks/koalas/strings.py
  34. +23 −27 databricks/koalas/testing/utils.py
  35. +15 −0 databricks/koalas/tests/indexes/__init__.py
  36. +300 −46 databricks/koalas/tests/{test_indexes.py → indexes/test_base.py}
  37. +110 −0 databricks/koalas/tests/indexes/test_category.py
  38. +218 −0 databricks/koalas/tests/indexes/test_datetime.py
  39. +43 −4 databricks/koalas/tests/plot/test_frame_plot.py
  40. +20 −0 databricks/koalas/tests/plot/test_frame_plot_matplotlib.py
  41. +109 −0 databricks/koalas/tests/plot/test_frame_plot_plotly.py
  42. +49 −2 databricks/koalas/tests/plot/test_series_plot.py
  43. +29 −45 databricks/koalas/tests/plot/test_series_plot_matplotlib.py
  44. +125 −0 databricks/koalas/tests/plot/test_series_plot_plotly.py
  45. +461 −0 databricks/koalas/tests/test_categorical.py
  46. +515 −32 databricks/koalas/tests/test_dataframe.py
  47. +14 −1 databricks/koalas/tests/test_dataframe_conversion.py
  48. +78 −0 databricks/koalas/tests/test_dataframe_spark_io.py
  49. +1 −1 databricks/koalas/tests/test_extension.py
  50. +61 −14 databricks/koalas/tests/test_groupby.py
  51. +35 −4 databricks/koalas/tests/test_indexing.py
  52. +57 −0 databricks/koalas/tests/test_namespace.py
  53. +568 −77 databricks/koalas/tests/test_ops_on_diff_frames.py
  54. +9 −0 databricks/koalas/tests/test_ops_on_diff_frames_groupby.py
  55. +27 −8 databricks/koalas/tests/test_reshape.py
  56. +530 −72 databricks/koalas/tests/test_series.py
  57. +33 −0 databricks/koalas/tests/test_series_datetime.py
  58. +8 −8 databricks/koalas/tests/test_series_string.py
  59. +230 −26 databricks/koalas/tests/test_stats.py
  60. +210 −62 databricks/koalas/tests/test_typedef.py
  61. +316 −73 databricks/koalas/typedef/typehints.py
  62. +17 −2 databricks/koalas/usage_logging/__init__.py
  63. +105 −44 databricks/koalas/utils.py
  64. +1 −1 databricks/koalas/version.py
  65. +18 −10 databricks/koalas/window.py
  66. +4 −39 dev/gendoc.py
  67. +7 −0 dev/lint-python
  68. +2 −2 dev/pytest
  69. +1 −1 dev/tox.ini
  70. +6 −7 docs/source/conf.py
  71. +3 −10 docs/source/development/contributing.rst
  72. +12,550 −675 docs/source/getting_started/10min.ipynb
  73. +6 −1 docs/source/getting_started/install.rst
  74. +3 −0 docs/source/index.rst
  75. +7 −0 docs/source/reference/frame.rst
  76. +1 −1 docs/source/reference/general_functions.rst
  77. +89 −6 docs/source/reference/indexing.rst
  78. +8 −0 docs/source/reference/io.rst
  79. +30 −6 docs/source/reference/series.rst
  80. +107 −0 docs/source/user_guide/from_to_dbms.rst
  81. +1 −0 docs/source/user_guide/index.rst
  82. +6 −7 docs/source/user_guide/options.rst
  83. +1 −1 docs/source/user_guide/transform_apply.rst
  84. +5 −4 docs/source/user_guide/typehints.rst
  85. +11 −0 docs/source/user_guide/types.rst
  86. +19 −9 postBuild
  87. +12 −3 requirements-dev.txt
  88. +7 −5 setup.py
65 changes: 49 additions & 16 deletions .github/workflows/master.yml
@@ -20,18 +20,33 @@ jobs:
spark-version: 2.3.4
pandas-version: 0.23.4
pyarrow-version: 0.16.0
- python-version: 3.5
numpy-version: 1.18.5
- python-version: 3.6
spark-version: 2.3.4
pandas-version: 0.24.2
pyarrow-version: 0.10.0
numpy-version: 1.19.5
default-index-type: 'distributed-sequence'
- python-version: 3.9
spark-version: 3.1.2
pandas-version: 1.2.5
pyarrow-version: 3.0.0
numpy-version: 1.20.3
- python-version: 3.9
spark-version: 3.2.0
pandas-version: 1.2.5
pyarrow-version: 4.0.1
numpy-version: 1.21.2
default-index-type: 'distributed-sequence'
env:
PYTHON_VERSION: ${{ matrix.python-version }}
SPARK_VERSION: ${{ matrix.spark-version }}
PANDAS_VERSION: ${{ matrix.pandas-version }}
PYARROW_VERSION: ${{ matrix.pyarrow-version }}
NUMPY_VERSION: ${{ matrix.numpy-version }}
DEFAULT_INDEX_TYPE: ${{ matrix.default-index-type }}
KOALAS_TESTING: 1
SPARK_LOCAL_IP: 127.0.0.1
# DISPLAY=0.0 does not work in Github Actions with Python 3.5. Here we work around with xvfb-run
PYTHON_EXECUTABLE: xvfb-run python
# Github token is required to auto-generate the release notes from Github release notes
@@ -61,8 +76,12 @@ jobs:
# as Black only works with Python 3.6+. This is hacky but we will drop
# Python 3.5 soon so it's fine.
if [[ "$PYTHON_VERSION" < "3.6" ]]; then sed -i '/black/d' requirements-dev.txt; fi
# sphinx-plotly-directive supports Python 3.6+
if [[ "$PYTHON_VERSION" < "3.6" ]]; then sed -i '/sphinx-plotly-directive/d' requirements-dev.txt; fi
# Disable mypy check for PySpark 3.1
if [[ "$SPARK_VERSION" > "3.1" ]]; then sed -i '/mypy/d' requirements-dev.txt; fi
pip install -r requirements-dev.txt
pip install pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION pyspark==$SPARK_VERSION
pip install pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION pyspark==$SPARK_VERSION numpy==$NUMPY_VERSION
# matplotlib dropped Python 3.5 support from 3.1.x; however, 3.0.3 only supports sphinx 2.x.
# It forces the sphinx version to 2.x.
if [[ "$PYTHON_VERSION" < "3.6" ]]; then pip install "sphinx<3.0.0"; fi
@@ -86,37 +105,45 @@ jobs:
spark-version: 2.4.7
pandas-version: 0.24.2
pyarrow-version: 0.14.1
numpy-version: 1.19.5
logger: databricks.koalas.usage_logging.usage_logger
- python-version: 3.6
spark-version: 2.4.7
pandas-version: 0.25.3
pyarrow-version: 0.15.1
default-index-type: 'distributed-sequence'
- python-version: 3.7
spark-version: 2.4.7
pandas-version: 0.25.3
pyarrow-version: 0.14.1
- python-version: 3.7
spark-version: 2.4.7
pandas-version: 1.0.5
pyarrow-version: 0.15.1
numpy-version: 1.19.5
default-index-type: 'distributed-sequence'
- python-version: 3.7
spark-version: 3.0.1
pandas-version: 0.25.3
spark-version: 3.0.2
pandas-version: 1.0.5
pyarrow-version: 1.0.1
numpy-version: 1.19.5
- python-version: 3.7
spark-version: 3.1.1
pandas-version: 1.1.5
pyarrow-version: 2.0.0
numpy-version: 1.19.5
default-index-type: 'distributed-sequence'
- python-version: 3.8
spark-version: 3.0.1
spark-version: 3.0.2
pandas-version: 1.1.5
pyarrow-version: 2.0.0
numpy-version: 1.19.5
- python-version: 3.8
spark-version: 3.1.1
pandas-version: 1.2.5
pyarrow-version: 3.0.0
numpy-version: 1.20.3
default-index-type: 'distributed-sequence'
env:
PYTHON_VERSION: ${{ matrix.python-version }}
SPARK_VERSION: ${{ matrix.spark-version }}
PANDAS_VERSION: ${{ matrix.pandas-version }}
PYARROW_VERSION: ${{ matrix.pyarrow-version }}
NUMPY_VERSION: ${{ matrix.numpy-version }}
DEFAULT_INDEX_TYPE: ${{ matrix.default-index-type }}
KOALAS_TESTING: 1
SPARK_LOCAL_IP: 127.0.0.1
# `QT_QPA_PLATFORM` for resolving 'QXcbConnection: Could not connect to display :0.0'
DISPLAY: 0.0
QT_QPA_PLATFORM: offscreen
@@ -141,15 +168,21 @@ jobs:
conda config --env --add pinned_packages python=$PYTHON_VERSION
conda config --env --add pinned_packages pandas==$PANDAS_VERSION
conda config --env --add pinned_packages pyarrow==$PYARROW_VERSION
conda config --env --add pinned_packages numpy==$NUMPY_VERSION
conda config --env --add pinned_packages pyspark==$SPARK_VERSION
if [[ "$SPARK_VERSION" < "3.0" ]]; then
pip install pyspark==$SPARK_VERSION
else
conda install -c conda-forge --yes pyspark==$SPARK_VERSION
fi
conda install -c conda-forge --yes pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION
sed -i -e "/pandas/d" -e "/pyarrow/d" requirements-dev.txt
conda install -c conda-forge --yes pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION numpy==$NUMPY_VERSION
sed -i -e "/pandas/d" -e "/pyarrow/d" -e "/numpy>=/d" requirements-dev.txt
# Disable mypy check for PySpark 3.1
if [[ "$SPARK_VERSION" > "3.1" ]]; then sed -i '/mypy/d' requirements-dev.txt; fi
# sphinx-plotly-directive is not available on Conda.
sed -i '/sphinx-plotly-directive/d' requirements-dev.txt
conda install -c conda-forge --yes --file requirements-dev.txt
pip install sphinx-plotly-directive # pip-only dependency
conda list
- name: Run tests
run: |
4 changes: 3 additions & 1 deletion README.md
@@ -1,3 +1,5 @@
## DEPRECATED: Koalas supports Apache Spark 3.1 and below as it is [officially included to PySpark in Apache Spark 3.2](https://issues.apache.org/jira/browse/SPARK-34849). This repository is now in maintenance mode. For Apache Spark 3.2 and above, please use [PySpark](https://spark.apache.org/docs/latest/api/python/migration_guide/koalas_to_pyspark.html) directly.

<p align="center">
<img src="https://raw.githubusercontent.com/databricks/koalas/master/icons/koalas-logo.png" width="140"/>
</p>
@@ -52,7 +54,7 @@ pip install koalas

See [Installation](https://koalas.readthedocs.io/en/latest/getting_started/install.html) for more details.

For Databricks Runtime users, Koalas is pre-installed in Databricks Runtime 7.1 and above, or you can follow these [steps](https://docs.databricks.com/libraries/index.html) to install a library on Databricks.
For Databricks Runtime, Koalas is pre-installed in Databricks Runtime 7.1 and above. Try [Databricks Community Edition](https://community.cloud.databricks.com/) for free. You can also follow these [steps](https://docs.databricks.com/libraries/index.html) to manually install a library on Databricks.

Lastly, if your PyArrow version is 0.15+ and your PySpark version is lower than 3.0, it is best for you to set `ARROW_PRE_0_15_IPC_FORMAT` environment variable to `1` manually.
Koalas will try its best to set it for you but it is impossible to set it if there is a Spark context already launched.
1 change: 1 addition & 0 deletions apt.txt
@@ -0,0 +1 @@
openjdk-8-jre
7 changes: 0 additions & 7 deletions databricks/conftest.py
@@ -25,7 +25,6 @@

import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt
from pyspark import __version__

from databricks import koalas as ks
@@ -102,12 +101,6 @@ def add_caplog(caplog):
yield


@pytest.fixture(autouse=True)
def close_figs():
yield
plt.close("all")


@pytest.fixture(autouse=True)
def check_options():
orig_default_index_type = ks.options.compute.default_index_type
61 changes: 58 additions & 3 deletions databricks/koalas/__init__.py
@@ -20,10 +20,31 @@
from databricks.koalas.version import __version__ # noqa: F401


def assert_python_version():
import warnings

major = 3
minor = 5
deprecated_version = (major, minor)
min_supported_version = (major, minor + 1)

if sys.version_info[:2] <= deprecated_version:
warnings.warn(
"Koalas support for Python {dep_ver} is deprecated and will be dropped in "
"the future release. At that point, existing Python {dep_ver} workflows "
"that use Koalas will continue to work without modification, but Python {dep_ver} "
"users will no longer get access to the latest Koalas features and bugfixes. "
"We recommend that you upgrade to Python {min_ver} or newer.".format(
dep_ver=".".join(map(str, deprecated_version)),
min_ver=".".join(map(str, min_supported_version)),
),
FutureWarning,
)


def assert_pyspark_version():
import logging

pyspark_ver = None
try:
import pyspark
except ImportError:
@@ -33,17 +54,42 @@ def assert_pyspark_version():
)
else:
pyspark_ver = getattr(pyspark, "__version__")
if pyspark_ver is None or pyspark_ver < "2.4":
if pyspark_ver is None or LooseVersion(pyspark_ver) < LooseVersion("2.4"):
logging.warning(
'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format(
pyspark_ver if pyspark_ver is not None else "<unknown version>"
)
)
elif LooseVersion(pyspark_ver) >= LooseVersion("3.2"):
logging.warning(
'Found pyspark version "{}" installed. The pyspark version 3.2 and above has '
'a built-in "pandas APIs on Spark" module ported from Koalas. '
"Try `import pyspark.pandas as ps` instead. ".format(pyspark_ver)
)


assert_python_version()
assert_pyspark_version()

import pyspark
import numpy

if LooseVersion(pyspark.__version__) < LooseVersion("3.1") and LooseVersion(
numpy.__version__
) >= LooseVersion("1.20"):
import logging

logging.warning(
'Found numpy version "{numpy_version}" installed with pyspark version "{pyspark_version}". '
"Some functions will not work well with this combination of "
'numpy version "{numpy_version}" and pyspark version "{pyspark_version}". '
"Please try to upgrade pyspark version to 3.1 or above, "
"or downgrade numpy version to below 1.20.".format(
numpy_version=numpy.__version__, pyspark_version=pyspark.__version__
)
)


import pyarrow

if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
@@ -86,20 +132,29 @@ def assert_pyspark_version():
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

from databricks.koalas.frame import DataFrame
from databricks.koalas.indexes import Index, MultiIndex
from databricks.koalas.indexes.base import Index
from databricks.koalas.indexes.category import CategoricalIndex
from databricks.koalas.indexes.datetimes import DatetimeIndex
from databricks.koalas.indexes.multi import MultiIndex
from databricks.koalas.indexes.numeric import Float64Index, Int64Index
from databricks.koalas.series import Series
from databricks.koalas.groupby import NamedAgg

__all__ = [ # noqa: F405
"read_csv",
"read_parquet",
"to_datetime",
"date_range",
"from_pandas",
"get_dummies",
"DataFrame",
"Series",
"Index",
"MultiIndex",
"Int64Index",
"Float64Index",
"CategoricalIndex",
"DatetimeIndex",
"sql",
"range",
"concat",