Merged

261 commits
5669b21
[SPARK-32840][SQL] Invalid interval value can happen to be just adhes…
yaooqinn Sep 10, 2020
a22871f
[SPARK-32777][SQL] Aggregation support aggregate function with multip…
beliefer Sep 10, 2020
5f468cc
[SPARK-32822][SQL] Change the number of partitions to zero when a ran…
sarutak Sep 11, 2020
328d81a
[SPARK-32677][SQL][DOCS][MINOR] Improve code comment in CreateFunctio…
cloud-fan Sep 11, 2020
fe2ab25
[MINOR][SQL] Fix a typo at 'spark.sql.sources.fileCompressionFactor' …
Ted-Jiang Sep 11, 2020
9f4f49c
[SPARK-32853][SQL] Consecutive save/load calls in DataFrame/StreamRea…
cloud-fan Sep 11, 2020
94cac59
[SPARK-32730][SQL][FOLLOW-UP] Improve LeftAnti SortMergeJoin right si…
peter-toth Sep 11, 2020
f6322d1
[SPARK-32180][PYTHON][DOCS] Installation page of Getting Started in P…
Sep 11, 2020
b4be6a6
[SPARK-32845][SS][TESTS] Add sinkParameter to check sink options robu…
dongjoon-hyun Sep 11, 2020
4269c2c
[SPARK-32851][SQL][TEST] Tests should fail if errors happen when gene…
maropu Sep 11, 2020
ce566be
[SPARK-32180][FOLLOWUP] Fix .rst error in new Pyspark installation guide
srowen Sep 12, 2020
2009f95
[SPARK-32779][SQL][FOLLOW-UP] Delete Unused code
sandeep-katta Sep 12, 2020
bbbd907
[SPARK-32804][LAUNCHER] Fix run-example command builder bug
KevinSmile Sep 12, 2020
3be552c
[SPARK-30090][SHELL] Adapt Spark REPL to Scala 2.13
karolchmist Sep 12, 2020
3d08084
[SPARK-24994][SQL] Add UnwrapCastInBinaryComparison optimizer to simp…
sunchao Sep 13, 2020
0549c20
[SPARK-32865][DOC] python section in quickstart page doesn't display …
bowenli86 Sep 13, 2020
a6d6ea3
[SPARK-32802][SQL] Avoid using SpecificInternalRow in RunLengthEncodi…
sunchao Sep 13, 2020
fbb0f37
[SPARK-32869][BUILD] Ignore deprecation warnings for build with Scala…
sarutak Sep 14, 2020
e558b8a
[SPARK-31847][CORE][TESTS] DAGSchedulerSuite: Rewrite the test framew…
beliefer Sep 14, 2020
742fcff
[SPARK-32839][WINDOWS] Make Spark scripts working with the spaces in …
HyukjinKwon Sep 14, 2020
b121f0d
[SPARK-32873][BUILD] Fix code which causes error when build with sbt …
sarutak Sep 14, 2020
978f531
[SPARK-32854][SS] Minor code and doc improvement for stream-stream join
c21 Sep 14, 2020
5e82548
[SPARK-32844][SQL] Make `DataFrameReader.table` take the specified op…
xuanyuanking Sep 14, 2020
7a17158
[SPARK-32868][SQL] Add more order irrelevant aggregates to EliminateS…
tanelk Sep 14, 2020
0696f04
[SPARK-32876][SQL] Change default fallback versions to 3.0.1 and 2.4.…
HyukjinKwon Sep 14, 2020
72550c3
[SPARK-32872][CORE] Prevent BytesToBytesMap at MAX_CAPACITY from exce…
ankurdave Sep 14, 2020
d58a4a3
[SPARK-32882][K8S] Remove python2 installation in K8s python image
williamhyun Sep 14, 2020
4fac6d5
[SPARK-32871][BUILD] Append toMap to Map#filterKeys if the result of …
sarutak Sep 15, 2020
7a9b066
[SPARK-32715][CORE] Fix memory leak when failed to store pieces of br…
LantaoJin Sep 15, 2020
0811666
[SPARK-32878][CORE] Avoid scheduling TaskSetManager which has no pend…
Ngone51 Sep 15, 2020
d8a0d85
[SPARK-32884][TESTS] Mark TPCDSQuery*Suite as ExtendedSQLTest
dongjoon-hyun Sep 15, 2020
c8baab1
[SPARK-32879][SQL] Refactor SparkSession initial options
hvanhovell Sep 15, 2020
99384d1
[SPARK-32738][CORE] Should reduce the number of active threads if fat…
wzhfy Sep 15, 2020
316242b
[SPARK-32874][SQL][TEST] Enhance result set meta data check for execu…
yaooqinn Sep 15, 2020
6f36db1
[SPARK-31448][PYTHON] Fix storage level used in persist() in datafram…
abhishekd0907 Sep 15, 2020
888b343
[SPARK-32827][SQL] Add spark.sql.maxMetadataStringLength config
ulysses-you Sep 15, 2020
108c4c8
[SPARK-32481][SQL][TESTS][FOLLOW-UP] Skip the test if trash directory…
HyukjinKwon Sep 15, 2020
b46c730
[SPARK-32704][SQL][TESTS][FOLLOW-UP] Check any physical rule instead …
HyukjinKwon Sep 16, 2020
6051755
[SPARK-32688][SQL][TEST] Add special values to LiteralGenerator for f…
tanelk Sep 16, 2020
2e3aa2f
[SPARK-32861][SQL] GenerateExec should require column ordering
allisonwang-db Sep 16, 2020
550c1c9
[SPARK-32888][DOCS] Add user document about header flag and RDD as pa…
viirya Sep 16, 2020
e884290
[SPARK-32835][PYTHON] Add withField method to the pyspark Column class
Sep 16, 2020
c918909
[SPARK-32814][PYTHON] Replace __metaclass__ field with metaclass keyword
zero323 Sep 16, 2020
3bc13e6
[SPARK-32706][SQL] Improve cast string to decimal type
wangyum Sep 16, 2020
355ab6a
[SPARK-32804][LAUNCHER][FOLLOWUP] Fix SparkSubmitCommandBuilderSuite …
KevinSmile Sep 16, 2020
56ae950
[SPARK-32850][CORE] Simplify the RPC message flow of decommission
Ngone51 Sep 16, 2020
40ef5c9
[SPARK-32816][SQL] Fix analyzer bug when aggregating multiple distinc…
linhongliu-db Sep 16, 2020
657e39a
[SPARK-32897][PYTHON] Don't show a deprecation warning at SparkSessio…
HyukjinKwon Sep 16, 2020
7fdb571
[SPARK-32890][SQL] Pass all `sql/hive` module UTs in Scala 2.13
LuciferYang Sep 16, 2020
d936cb3
[SPARK-26425][SS] Add more constraint checks to avoid checkpoint corr…
HeartSaVioR Sep 17, 2020
bd38e0b
[SPARK-32903][SQL] GeneratePredicate should be able to eliminate comm…
viirya Sep 17, 2020
92b75dc
[SPARK-32508][SQL] Disallow empty part col values in partition spec b…
cxzl25 Sep 17, 2020
e5e54a3
[SPARK-32900][CORE] Allow UnsafeExternalSorter to spill when there ar…
tomvanbussel Sep 17, 2020
a54a6a0
[SPARK-32287][CORE] Fix flaky o.a.s.ExecutorAllocationManagerSuite on…
Ngone51 Sep 17, 2020
482a79a
[SPARK-24994][SQL][FOLLOW-UP] Handle foldable, timezone and cleanup
sunchao Sep 17, 2020
88e87bc
[SPARK-32887][DOC] Correct the typo for SHOW TABLE
Udbhav30 Sep 17, 2020
a8442c2
[SPARK-32926][TESTS] Add Scala 2.13 build test in GitHub Action
dongjoon-hyun Sep 17, 2020
5817c58
[SPARK-32909][SQL] Pass all `sql/hive-thriftserver` module UTs in Sca…
LuciferYang Sep 17, 2020
ea3b979
[SPARK-32889][SQL] orc table column name supports special characters
Sep 17, 2020
4ced588
[SPARK-32635][SQL] Fix foldable propagation
peter-toth Sep 17, 2020
68e0d5f
[SPARK-32902][SQL] Logging plan changes for AQE
maropu Sep 17, 2020
9d6221b
[SPARK-18409][ML][FOLLOWUP] LSH approxNearestNeighbors optimization 2
zhengruifeng Sep 18, 2020
75dd864
[SPARK-32908][SQL] Fix target error calculation in `percentile_approx()`
MaxGekk Sep 18, 2020
b49aaa3
[SPARK-32906][SQL] Struct field names should not change after normali…
maropu Sep 18, 2020
8b09536
[SPARK-27951][SQL] Support ANSI SQL NTH_VALUE window function
beliefer Sep 18, 2020
9e9d4b6
[SPARK-32905][CORE][YARN] ApplicationMaster fails to receive UpdateDe…
yaooqinn Sep 18, 2020
7892887
[SPARK-32930][CORE] Replace deprecated isFile/isDirectory methods
williamhyun Sep 18, 2020
105225d
[SPARK-32911][CORE] Free memory in UnsafeExternalSorter.SpillableIter…
tomvanbussel Sep 18, 2020
e2a7401
[SPARK-32874][SQL][FOLLOWUP][TEST-HIVE1.2][TEST-HADOOP2.7] Fix spark-…
yaooqinn Sep 18, 2020
664a171
[SPARK-32936][SQL] Pass all `external/avro` module UTs in Scala 2.13
LuciferYang Sep 18, 2020
2128c4f
[SPARK-32808][SQL] Pass all test of sql/core module in Scala 2.13
LuciferYang Sep 18, 2020
3309a2b
[SPARK-32635][SQL][FOLLOW-UP] Add a new test case in catalyst module
peter-toth Sep 18, 2020
f1dc479
[SPARK-32898][CORE] Fix wrong executorRunTime when task killed before…
Ngone51 Sep 18, 2020
f893a19
[SPARK-32180][PYTHON][DOCS][FOLLOW-UP] Rephrase and add some more inf…
HyukjinKwon Sep 20, 2020
7fb9f68
[SPARK-32799][R][SQL] Add allowMissingColumns to SparkR unionByName
zero323 Sep 21, 2020
9c653c9
[SPARK-32189][DOCS][PYTHON] Development - Setting up IDEs
itholic Sep 21, 2020
0c66813
Revert "[SPARK-32850][CORE] Simplify the RPC message flow of decommis…
cloud-fan Sep 21, 2020
1ad1f71
[SPARK-32946][R][SQL] Add withColumn to SparkR
zero323 Sep 21, 2020
c336ddf
[SPARK-32867][SQL] When explain, HiveTableRelation show limited message
AngersZhuuuu Sep 21, 2020
d01594e
[SPARK-32886][WEBUI] fix 'undefined' link in event timeline view
zhli1142015 Sep 21, 2020
5440ea8
[SPARK-32312][DOC][FOLLOWUP] Fix the minimum version of PyArrow in th…
ueshin Sep 22, 2020
f03c035
[SPARK-32951][SQL] Foldable propagation from Aggregate
peter-toth Sep 22, 2020
3118c22
[SPARK-32949][R][SQL] Add timestamp_seconds to SparkR
zero323 Sep 22, 2020
790d9ef
[SPARK-32955][DOCS] An item in the navigation bar in the WebUI has a …
sarutak Sep 22, 2020
6145621
[SPARK-32659][SQL][FOLLOWUP] Broadcast Array instead of Set in InSubq…
cloud-fan Sep 22, 2020
dd80845
[SPARK-32964][DSTREAMS] Pass all `streaming` module UTs in Scala 2.13
LuciferYang Sep 22, 2020
fba5736
[SPARK-32757][SQL][FOLLOWUP] Preserve the attribute name as possible …
cloud-fan Sep 22, 2020
7c14f17
[SPARK-32306][SQL][DOCS] Clarify the result of `percentile_approx()`
MaxGekk Sep 22, 2020
779f0a8
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
zero323 Sep 23, 2020
942f577
[SPARK-32017][PYTHON][BUILD] Make Pyspark Hadoop 3.2+ Variant availab…
HyukjinKwon Sep 23, 2020
b53da23
[MINOR][SQL] Improve examples for `percentile_approx()`
MaxGekk Sep 23, 2020
acfee3c
[SPARK-32870][DOCS][SQL] Make sure that all expressions have their Ex…
tanelk Sep 23, 2020
21b7479
[SPARK-32959][SQL][TEST] Fix an invalid test in DataSourceV2SQLSuite
imback82 Sep 23, 2020
432afac
[SPARK-32907][ML] adaptively blockify instances - revert blockify gmm
zhengruifeng Sep 23, 2020
383bb4a
[SPARK-32892][CORE][SQL] Fix hash functions on big-endian platforms
mundaym Sep 23, 2020
faeb71b
[SPARK-32950][SQL] Remove unnecessary big-endian code paths
mundaym Sep 23, 2020
3c97665
[SPARK-32981][BUILD] Remove hive-1.2/hadoop-2.7 from Apache Spark 3.1…
dongjoon-hyun Sep 23, 2020
27f6b5a
[SPARK-32937][SPARK-32980][K8S] Fix decom & launcher tests and add so…
holdenk Sep 23, 2020
527cd3f
[SPARK-32971][K8S] Support dynamic PVC creation/deletion for K8s exec…
dongjoon-hyun Sep 23, 2020
b3f0087
[SPARK-32977][SQL][DOCS] Fix JavaDoc on Default Save Mode
RussellSpitzer Sep 24, 2020
0bc0e91
[SPARK-32971][K8S][FOLLOWUP] Add `.toSeq` for Scala 2.13 compilation
dongjoon-hyun Sep 24, 2020
31a16fb
[SPARK-32714][PYTHON] Initial pyspark-stubs port
zero323 Sep 24, 2020
688d016
[SPARK-32982][BUILD] Remove hive-1.2 profiles in PIP installation option
HyukjinKwon Sep 24, 2020
fe6d38d
[SPARK-32987][MESOS] Pass all `mesos` module UTs in Scala 2.13
LuciferYang Sep 24, 2020
4ae0f70
[SPARK-32954][YARN][TEST] Add jakarta.servlet-api test dependency to …
LuciferYang Sep 24, 2020
8ccfbc1
[SPARK-32381][CORE][SQL] Move and refactor parallel listing & non-loc…
sunchao Sep 24, 2020
d7aa3b5
[SPARK-32889][SQL][TESTS][FOLLOWUP] Skip special column names test in…
dongjoon-hyun Sep 24, 2020
e9c98c9
[SPARK-32990][SQL] Migrate REFRESH TABLE to use UnresolvedTableOrView…
imback82 Sep 25, 2020
f2fc966
[SPARK-32877][SQL][TEST] Add test for Hive UDF complex decimal type
ulysses-you Sep 25, 2020
9e6882f
[SPARK-32885][SS] Add DataStreamReader.table API
xuanyuanking Sep 25, 2020
e887c63
[SPARK-32931][SQL] Unevaluable Expressions are not Foldable
gatorsmile Sep 25, 2020
6c80547
[SPARK-32997][K8S] Support dynamic PVC creation and deletion in K8s d…
dongjoon-hyun Sep 25, 2020
934a91f
[SPARK-21481][ML][FOLLOWUP][TRIVIAL] HashingTF use util.collection.Op…
zhengruifeng Sep 26, 2020
9a155d4
[SPARK-32999][SQL] Use Utils.getSimpleName to avoid hitting Malformed…
rednaxelafx Sep 26, 2020
0c38765
[SPARK-32974][ML] FeatureHasher transform optimization
zhengruifeng Sep 27, 2020
c65b645
[SPARK-32714][FOLLOW-UP][PYTHON] Address pyspark.install typing errors
zero323 Sep 27, 2020
bc77e5b
[SPARK-32973][ML][DOC] FeatureHasher does not check categoricalCols i…
zhengruifeng Sep 27, 2020
bb6d5e7
[SPARK-32972][ML] Pass all UTs of `mllib` module in Scala 2.13
LuciferYang Sep 27, 2020
f41ba2a
[SPARK-32927][SQL] Bitwise OR, AND and XOR should have similar canoni…
tanelk Sep 28, 2020
a7f84a0
[SPARK-32187][PYTHON][DOCS] Doc on Python packaging
fhoering Sep 28, 2020
d15f504
[SPARK-33011][ML] Promote the stability annotation to Evolving for ML…
HeartSaVioR Sep 28, 2020
173da5b
[SPARK-32996][WEB-UI] Handle empty ExecutorMetrics in ExecutorMetrics…
shrutig Sep 28, 2020
a53fc9b
[SPARK-27951][SQL][FOLLOWUP] Improve the window function nth_value
beliefer Sep 29, 2020
376ede1
[SPARK-33021][PYTHON][TESTS] Move functions related test cases into t…
HyukjinKwon Sep 29, 2020
68cd567
[SPARK-33015][SQL] Compute the current date only once
MaxGekk Sep 29, 2020
6868b40
[SPARK-33020][PYTHON] Add nth_value as a PySpark function
HyukjinKwon Sep 29, 2020
1b60ff5
[MINOR][DOCS] Document when `current_date` and `current_timestamp` ar…
MaxGekk Sep 29, 2020
202115e
[SPARK-32948][SQL] Optimize to_json and from_json expression chain
viirya Sep 29, 2020
90e86f6
[SPARK-32970][SPARK-32019][SQL][TEST] Reduce the runtime of an UT for
tanelk Sep 29, 2020
f167002
[SPARK-32901][CORE] Do not allocate memory while spilling UnsafeExter…
tomvanbussel Sep 29, 2020
7766fd1
[MINOR][DOCS] Fixing log message for better clarity
akshatb1 Sep 29, 2020
711d8dd
[SPARK-33018][SQL] Fix estimate statistics issue if child has 0 bytes
wangyum Sep 29, 2020
cc06266
[SPARK-33019][CORE] Use spark.hadoop.mapreduce.fileoutputcommitter.al…
dongjoon-hyun Sep 29, 2020
3a299aa
[SPARK-32741][SQL] Check if the same ExprId refers to the unique attr…
maropu Sep 30, 2020
ece8d8e
[SPARK-33006][K8S][DOCS] Add dynamic PVC usage example into K8s doc
dongjoon-hyun Sep 30, 2020
3bdbb55
[SPARK-31753][SQL][DOCS][FOLLOW-UP] Add missing keywords in the SQL docs
GuoPhilipse Sep 30, 2020
d75222d
[SPARK-33012][BUILD][K8S] Upgrade fabric8 to 4.10.3
Oct 1, 2020
0b5a379
[SPARK-33023][CORE] Judge path of Windows need add condition `Utils…
AngersZhuuuu Oct 1, 2020
28ed3a5
[SPARK-32723][WEBUI] Upgrade to jQuery 3.5.1
peter-toth Oct 1, 2020
5651284
[SPARK-32992][SQL] Map Oracle's ROWID type to StringType in read via …
MaxGekk Oct 1, 2020
d3dbe1a
[SQL][DOC][MINOR] Corrects input table names in the examples of CREAT…
iRakson Oct 1, 2020
0963fcd
[SPARK-33024][SQL] Fix CodeGen fallback issue of UDFSuite in Scala 2.13
LuciferYang Oct 1, 2020
9c618b3
[SPARK-33047][BUILD] Upgrade hive-storage-api to 2.7.2
dongjoon-hyun Oct 1, 2020
e62d247
[SPARK-32585][SQL] Support scala enumeration in ScalaReflection
ulysses-you Oct 1, 2020
0059997
[SPARK-33046][DOCS] Update how to build doc for Scala 2.13 with sbt
sarutak Oct 1, 2020
8657742
[SPARK-32996][WEB-UI][FOLLOWUP] Move ExecutorSummarySuite to proper path
shrutig Oct 1, 2020
d6f3138
[SPARK-32859][SQL] Introduce physical rule to decide bucketing dynami…
c21 Oct 2, 2020
991f7e8
[SPARK-32001][SQL] Create JDBC authentication provider developer API
gaborgsomogyi Oct 2, 2020
9996e25
[SPARK-33026][SQL] Add numRows to metric of BroadcastExchangeExec
wangyum Oct 2, 2020
b205be5
[SPARK-33051][INFRA][R] Uses setup-r to install R in GitHub Actions b…
HyukjinKwon Oct 2, 2020
f7ba952
[SPARK-33048][BUILD] Fix SparkBuild.scala to recognize build settings…
sarutak Oct 2, 2020
aa66579
[SPARK-33050][BUILD] Upgrade Apache ORC to 1.5.12
dongjoon-hyun Oct 2, 2020
9b88aca
[SPARK-33030][R] Add nth_value to SparkR
zero323 Oct 2, 2020
82721ce
[SPARK-32741][SQL][FOLLOWUP] Run plan integrity check only for effect…
maropu Oct 2, 2020
1299c8a
[SPARK-33037][SHUFFLE] Remove knownManagers to support user's custom …
Oct 3, 2020
5af62a2
[SPARK-33052][SQL][TEST] Make all the database versions up-to-date fo…
maropu Oct 3, 2020
f86171a
[SPARK-33043][ML] Handle spark.driver.maxResultSize=0 in RowMatrix he…
srowen Oct 3, 2020
9b21fdd
[SPARK-32949][FOLLOW-UP][R][SQL] Reindent lines in SparkR timestamp_s…
zero323 Oct 3, 2020
37c806a
[SPARK-32958][SQL] Prune unnecessary columns from JsonToStructs
viirya Oct 3, 2020
db420f7
[SPARK-33049][CORE] Decommission shuffle block test is flaky
holdenk Oct 3, 2020
fab5321
[SPARK-33065][TESTS] Expand the stack size of a thread in a test in L…
sarutak Oct 4, 2020
4ab9aa0
[SPARK-33017][PYTHON] Add getCheckpointDir method to PySpark Context
reidy-p Oct 5, 2020
e83d03c
[SPARK-33040][R][ML] Add SparkR wrapper for vector_to_array
zero323 Oct 5, 2020
24f890e
[SPARK-33040][FOLLOW-UP][R] Reorder argument choices and add examples
zero323 Oct 5, 2020
0fb2574
[SPARK-33042][SQL][TEST] Add a test case to ensure changes to spark.s…
yuningzh-db Oct 5, 2020
023eb48
[SPARK-32914][SQL] Avoid constructing dataType multiple times
wangyum Oct 5, 2020
a09747b
[SPARK-33063][K8S] Improve error message for insufficient K8s volume …
Gschiavon Oct 5, 2020
14aeab3
[SPARK-33038][SQL] Combine AQE initial and current plan string when t…
allisonwang-db Oct 5, 2020
008a2ad
[SPARK-20202][BUILD][SQL] Remove references to org.spark-project.hive…
dongjoon-hyun Oct 5, 2020
a0aa8f3
[SPARK-33069][INFRA] Skip test result report if no JUnit XML files ar…
HyukjinKwon Oct 6, 2020
9870cf9
[SPARK-33067][SQL][TESTS] Add negative checks to JDBC v2 Table Catalo…
MaxGekk Oct 6, 2020
4adc282
[SPARK-33035][SQL] Updates the obsoleted entries of attribute mapping…
maropu Oct 6, 2020
2793347
[SPARK-32511][SQL] Add dropFields method to Column class
fqaiser94 Oct 6, 2020
ddc7012
[SPARK-32243][SQL] HiveSessionCatalog call super.makeFunctionExpressi…
AngersZhuuuu Oct 6, 2020
0812d6c
[SPARK-33073][PYTHON] Improve error handling on Pandas to Arrow conve…
BryanCutler Oct 6, 2020
b5e4b8c
[SPARK-27428][CORE][TEST] Increase receive buffer size used in Statsd…
mundaym Oct 6, 2020
ec6fccb
[SPARK-32243][SQL][FOLLOWUP] Fix compilation in HiveSessionCatalog
cloud-fan Oct 6, 2020
17d309d
[SPARK-32963][SQL] empty string should be consistent for schema name …
yaooqinn Oct 6, 2020
3b2a38d
[SPARK-32511][SQL][FOLLOWUP] Fix the broken build for Scala 2.13 with…
sarutak Oct 6, 2020
0b326d5
[SPARK-32857][CORE] Fix flaky o.a.s.s.BarrierTaskContextSuite.throw e…
Ngone51 Oct 6, 2020
57ed5a8
[SPARK-33007][SQL] Simplify named_struct + get struct field + from_js…
viirya Oct 6, 2020
584f90c
[SPARK-33067][SQL][TESTS][FOLLOWUP] Check error messages in JDBCTable…
MaxGekk Oct 7, 2020
5ce321d
[SPARK-33017][PYTHON][DOCS][FOLLOW-UP] Add getCheckpointDir into API …
HyukjinKwon Oct 7, 2020
aea78d2
[SPARK-33034][SQL] Support ALTER TABLE in JDBC v2 Table Catalog: add,…
MaxGekk Oct 7, 2020
7e99fcd
[SPARK-33004][SQL] Migrate DESCRIBE column to use UnresolvedTableOrVi…
imback82 Oct 7, 2020
4e1ded6
[SPARK-32189][DOCS][PYTHON][FOLLOW-UP] Fixed broken link and typo in …
itholic Oct 7, 2020
72da6f8
[SPARK-33002][PYTHON] Remove non-API annotations
zero323 Oct 7, 2020
94d648d
[SPARK-33036][SQL] Refactor RewriteCorrelatedScalarSubquery code to r…
maropu Oct 7, 2020
3099fd9
[SPARK-32067][K8S] Use unique ConfigMap name for executor pod template
stijndehaes Oct 7, 2020
a127387
[SPARK-33082][SQL] Remove hive-1.2 workaround code
dongjoon-hyun Oct 7, 2020
23afc93
[SPARK-26499][SQL][FOLLOWUP] Print the loading provider exception sta…
MaxGekk Oct 7, 2020
6daa2ae
[SPARK-21708][BUILD] Migrate build to sbt 1.x
gemelen Oct 7, 2020
37e1b0c
[SPARK-33086][PYTHON] Add static annotations for pyspark.resource
zero323 Oct 8, 2020
473b3ba
[SPARK-32511][FOLLOW-UP][SQL][R][PYTHON] Add dropFields to SparkR and…
zero323 Oct 8, 2020
39510b0
[SPARK-32793][SQL] Add raise_error function, adds error message param…
karenfeng Oct 8, 2020
bbc887b
[SPARK-33089][SQL] make avro format propagate Hadoop config from DS o…
yuningzh-db Oct 8, 2020
1c781a4
[SPARK-32282][SQL] Improve EnsureRquirement.reorderJoinKeys to handle…
imback82 Oct 8, 2020
7d6e3fb
[SPARK-33074][SQL] Classify dialect exceptions in JDBC v2 Table Catalog
MaxGekk Oct 8, 2020
5effa8e
[SPARK-33091][SQL] Avoid using map instead of foreach to avoid potent…
HyukjinKwon Oct 8, 2020
4a47b3e
[DOC][MINOR] pySpark usage - removed repeated keyword causing confusion
manubatham20 Oct 8, 2020
4987db8
[SPARK-33096][K8S] Use LinkedHashMap instead of Map for newlyCreatedE…
dongjoon-hyun Oct 8, 2020
c5f6af9
[SPARK-33094][SQL] Make ORC format propagate Hadoop config from DS op…
MaxGekk Oct 8, 2020
a907729
[SPARK-32743][SQL] Add distinct info at UnresolvedFunction toString
ulysses-you Oct 9, 2020
3beab8d
[SPARK-32793][FOLLOW-UP] Minor corrections for PySpark annotations an…
zero323 Oct 9, 2020
1234c66
[SPARK-33101][ML] Make LibSVM format propagate Hadoop config from DS …
MaxGekk Oct 9, 2020
e1909c9
[SPARK-33099][K8S] Respect executor idle timeout conf in ExecutorPods…
dongjoon-hyun Oct 9, 2020
edb140e
[SPARK-32896][SS] Add DataStreamWriter.table API
HeartSaVioR Oct 9, 2020
2e07ed3
[SPARK-33082][SPARK-20202][BUILD][SQL][FOLLOW-UP] Remove Hive 1.2 wor…
HyukjinKwon Oct 9, 2020
018811f
[SPARK-33105][INFRA] Change default R arch from i386 to x64 and param…
zero323 Oct 10, 2020
1e63dcc
[SPARK-33102][SQL] Use stringToSeq on SQL list typed parameters
gaborgsomogyi Oct 10, 2020
dfb7790
[SPARK-33108][BUILD] Remove sbt-dependency-graph SBT plugin
dongjoon-hyun Oct 10, 2020
7696ca5
[SPARK-32881][CORE] Catch some race condition errors and log them mor…
holdenk Oct 10, 2020
5e17014
[SPARK-33107][SQL] Remove hive-2.3 workaround code
wangyum Oct 10, 2020
83f8e13
[SPARK-33086][FOLLOW-UP] Remove unused Optional import from pyspark.r…
zero323 Oct 12, 2020
c78971b
[SPARK-33106][BUILD] Fix resolvers clash in SBT
gemelen Oct 12, 2020
50b2a49
[SPARK-21708][BUILD][FOLLOWUP] Rename hdpVersion to hadoopVersionValue
williamhyun Oct 12, 2020
4af1ac9
[SPARK-32047][SQL] Add JDBC connection provider disable possibility
gaborgsomogyi Oct 12, 2020
543d59d
[SPARK-33107][BUILD][FOLLOW-UP] Remove com.twitter:parquet-hadoop-bun…
wangyum Oct 12, 2020
9896288
[SPARK-33117][BUILD] Update zstd-jni to 1.4.5-6
dongjoon-hyun Oct 12, 2020
78c0967
[SPARK-33092][SQL] Support subexpression elimination in ProjectExec
viirya Oct 12, 2020
a0e3244
[SPARK-32704][SQL][FOLLOWUP] Corrects version values of plan logging …
maropu Oct 12, 2020
ed2fe8d
[SPARK-33111][ML] aft transform optimization
zhengruifeng Oct 12, 2020
b27a287
[SPARK-33016][SQL] Potential SQLMetrics missed which might cause WEB …
leanken-zz Oct 12, 2020
819f12e
[SPARK-33118][SQL] CREATE TEMPORARY TABLE fails with location
pablolanga-stratio Oct 12, 2020
86d26b4
[SPARK-32455][ML][FOLLOW-UP] LogisticRegressionModel prediction optim…
zhengruifeng Oct 13, 2020
e34f2d8
[SPARK-33119][SQL] ScalarSubquery should returns the first two rows t…
wangyum Oct 13, 2020
17eebd7
[SPARK-32295][SQL] Add not null and size > 0 filters before inner exp…
tanelk Oct 13, 2020
1b0875b
[SPARK-33115][BUILD][DOCS] Fix javadoc errors in `kvstore` and `unsaf…
gemelen Oct 13, 2020
feee8da
[SPARK-32858][SQL] UnwrapCastInBinaryComparison: support other numeri…
sunchao Oct 13, 2020
af3e2f7
[SPARK-33081][SQL] Support ALTER TABLE in JDBC v2 Table Catalog: upda…
huaxingao Oct 13, 2020
2b7239e
[SPARK-33125][SQL] Improve the error when Lead and Lag are not allowe…
beliefer Oct 13, 2020
dc697a8
[SPARK-13860][SQL] Change statistical aggregate function to return nu…
leanken-zz Oct 13, 2020
304ca1e
[SPARK-33129][BUILD][DOCS] Updating the build/sbt references to test-…
ScrapCodes Oct 13, 2020
1bfcb51
[SPARK-33132][WEBUI] Make `formatBytes` return `0.0 B` for negative i…
echohlne Oct 13, 2020
05a62dc
[SPARK-33134][SQL] Return partial results only for root JSON objects
MaxGekk Oct 14, 2020
d8c4a47
[SPARK-33061][SQL] Expose inverse hyperbolic trig functions through s…
rwpenney Oct 14, 2020
8e5cb1d
[SPARK-33136][SQL] Fix mistakenly swapped parameter in V2WriteCommand…
HeartSaVioR Oct 14, 2020
f3ad32f
[SPARK-33026][SQL][FOLLOWUP] metrics name should be numOutputRows
cloud-fan Oct 14, 2020
9ab0ec4
[SPARK-33146][CORE] Check for non-fatal errors when loading new appli…
Oct 15, 2020
ec34a00
[SPARK-33153][SQL][TESTS] Ignore Spark 2.4 in HiveExternalCatalogVers…
dongjoon-hyun Oct 15, 2020
77a8efb
[SPARK-32932][SQL] Do not use local shuffle reader at final stage on …
manuzhang Oct 15, 2020
8e7c390
[SPARK-33155][K8S] spark.kubernetes.pyspark.pythonVersion allows only…
dongjoon-hyun Oct 15, 2020
e85ed8a
[SPARK-33156][INFRA] Upgrade GithubAction image from 18.04 to 20.04
dongjoon-hyun Oct 15, 2020
513b6f5
[SPARK-33079][TESTS] Replace the existing Maven job for Scala 2.13 in…
sarutak Oct 15, 2020
31f7097
[SPARK-32402][SQL][FOLLOW-UP] Use quoted column name for JDBCTableCat…
huaxingao Oct 15, 2020
b089fe5
[SPARK-32247][INFRA] Install and test scipy with PyPy in GitHub Actions
HyukjinKwon Oct 15, 2020
82eea13
[SPARK-32915][CORE] Network-layer and shuffle RPC layer changes to su…
Victsm Oct 15, 2020
1e10577
Merge branch 'master' into pr/28618
attilapiros Oct 15, 2020
cac0e9e
apply Attila's review comments
attilapiros Oct 16, 2020
861f089
remove unused import
attilapiros Oct 16, 2020
48 changes: 33 additions & 15 deletions .github/workflows/build_and_test.yml
@@ -17,7 +17,8 @@ jobs:
   # Build: build Spark and run the tests for specified modules.
   build:
     name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
-    runs-on: ubuntu-latest
+    # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
+    runs-on: ubuntu-20.04
     strategy:
       fail-fast: false
       matrix:
@@ -154,12 +155,11 @@ jobs:
     - name: Install Python packages (Python 3.6 and PyPy3)
       if: contains(matrix.modules, 'pyspark')
       # PyArrow is not supported in PyPy yet, see ARROW-2651.
-      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
       run: |
         python3.6 -m pip install numpy pyarrow pandas scipy xmlrunner
         python3.6 -m pip list
         # PyPy does not have xmlrunner
-        pypy3 -m pip install numpy pandas
+        pypy3 -m pip install numpy pandas scipy
         pypy3 -m pip list
     - name: Install Python packages (Python 3.8)
       if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
@@ -168,12 +168,10 @@
         python3.8 -m pip list
     # SparkR
     - name: Install R 4.0
+      uses: r-lib/actions/setup-r@v1
       if: contains(matrix.modules, 'sparkr')
-      run: |
-        sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
-        curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
-        sudo apt-get update
-        sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
+      with:
+        r-version: 4.0
     - name: Install R packages
       if: contains(matrix.modules, 'sparkr')
       run: |
@@ -206,7 +204,7 @@ jobs:
   # Static analysis, and documentation build
   lint:
     name: Linters, licenses, dependencies and documentation generation
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
@@ -232,11 +230,9 @@
         # See also https://github.com/sphinx-doc/sphinx/issues/7551.
         pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx
     - name: Install R 4.0
-      run: |
-        sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
-        curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
-        sudo apt-get update
-        sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
+      uses: r-lib/actions/setup-r@v1
+      with:
+        r-version: 4.0
     - name: Install R linter dependencies and SparkR
       run: |
         sudo apt-get install -y libcurl4-openssl-dev
@@ -275,7 +271,7 @@ jobs:
 
   java11:
     name: Java 11 build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
@@ -297,3 +293,25 @@
         mkdir -p ~/.m2
         ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
         rm -rf ~/.m2/repository/org/apache/spark
+
+  scala-213:
+    name: Scala 2.13 build
+    runs-on: ubuntu-20.04
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Ivy local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.ivy2/cache
+        key: scala-213-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          scala-213-ivy-
+    - name: Install Java 11
+      uses: actions/setup-java@v1
+      with:
+        java-version: 11
+    - name: Build with SBT
+      run: |
+        ./dev/change-scala-version.sh 2.13
+        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Djava.version=11 -Pscala-2.13 compile test:compile
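
A note on the cache stanza in the new `scala-213` job: `hashFiles('**/pom.xml', '**/plugins.sbt')` makes the Ivy cache key change whenever any build definition file changes, while the `restore-keys` prefix lets a run that misses the exact key fall back to the most recent `scala-213-ivy-` cache instead of resolving every dependency from scratch.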
9 changes: 9 additions & 0 deletions .github/workflows/test_report.yml
@@ -15,7 +15,16 @@ jobs:
         github_token: ${{ secrets.GITHUB_TOKEN }}
         workflow: ${{ github.event.workflow_run.workflow_id }}
         commit: ${{ github.event.workflow_run.head_commit.id }}
+    - name: Check if JUnit report XML files exist
+      run: |
+        if ls **/target/test-reports/*.xml > /dev/null 2>&1; then
+          echo '::set-output name=FILE_EXISTS::true'
+        else
+          echo '::set-output name=FILE_EXISTS::false'
+        fi
+      id: check-junit-file
     - name: Publish test report
+      if: steps.check-junit-file.outputs.FILE_EXISTS == 'true'
       uses: scacap/action-surefire-report@v1
       with:
         check_name: Report test results
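
A note on the plumbing above: `::set-output` was GitHub Actions' workflow command for publishing a step output when this change was made; the guard on the publish step reads it back as `steps.check-junit-file.outputs.FILE_EXISTS`. GitHub has since deprecated `::set-output` in favor of appending to the `$GITHUB_OUTPUT` file, so a hypothetical modern rewrite of the same step (not part of this PR) would look like:

    - name: Check if JUnit report XML files exist
      id: check-junit-file
      run: |
        if ls **/target/test-reports/*.xml > /dev/null 2>&1; then
          echo "FILE_EXISTS=true" >> "$GITHUB_OUTPUT"
        else
          echo "FILE_EXISTS=false" >> "$GITHUB_OUTPUT"
        fi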
17 changes: 17 additions & 0 deletions .sbtopts
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+-J-Xmx4G
+-J-Xss4m
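
For context: in `.sbtopts`, options prefixed with `-J` are handed straight to the JVM that runs sbt, so the two flags above give the build a 4 GB maximum heap (`-Xmx4G`) and 4 MB thread stacks (`-Xss4m`); the larger stack is the usual guard against StackOverflowError in the deeply recursive Scala compiler. A rough one-off equivalent without this file (illustrative only; exact flag handling depends on the launcher script):

    SBT_OPTS="-Xmx4G -Xss4m" ./build/sbt compile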
7 changes: 7 additions & 0 deletions R/pkg/NAMESPACE
@@ -230,6 +230,7 @@ exportMethods("%<=>%",
               "asc",
               "ascii",
               "asin",
+              "assert_true",
               "atan",
               "atan2",
               "avg",
@@ -272,6 +273,7 @@ exportMethods("%<=>%",
               "degrees",
               "dense_rank",
               "desc",
+              "dropFields",
               "element_at",
               "encode",
               "endsWith",
@@ -348,6 +350,7 @@ exportMethods("%<=>%",
               "negate",
               "next_day",
               "not",
+              "nth_value",
               "ntile",
               "otherwise",
               "over",
@@ -359,6 +362,7 @@ exportMethods("%<=>%",
               "posexplode_outer",
               "quarter",
               "radians",
+              "raise_error",
               "rand",
               "randn",
               "rank",
@@ -405,6 +409,7 @@ exportMethods("%<=>%",
               "sumDistinct",
               "tan",
               "tanh",
+              "timestamp_seconds",
               "toDegrees",
               "toRadians",
               "to_csv",
@@ -425,9 +430,11 @@ exportMethods("%<=>%",
               "variance",
               "var_pop",
               "var_samp",
+              "vector_to_array",
               "weekofyear",
               "when",
               "window",
+              "withField",
               "xxhash64",
               "year")
14 changes: 12 additions & 2 deletions R/pkg/R/DataFrame.R
@@ -2863,11 +2863,18 @@ setMethod("unionAll",
 #' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
 #' into account. Input SparkDataFrames can have different data types in the schema.
 #'
+#' When the parameter allowMissingColumns is `TRUE`, the set of column names
+#' in x and y can differ; missing columns will be filled as null.
+#' Further, the missing columns of x will be added at the end
+#' in the schema of the union result.
+#'
 #' Note: This does not remove duplicate rows across the two SparkDataFrames.
+#' This function resolves columns by name (not by position).
 #'
 #' @param x A SparkDataFrame
 #' @param y A SparkDataFrame
+#' @param allowMissingColumns logical
 #' @param ... further arguments to be passed to or from other methods.
 #' @return A SparkDataFrame containing the result of the union.
 #' @family SparkDataFrame functions
 #' @rdname unionByName
@@ -2880,12 +2887,15 @@ setMethod("unionAll",
 #' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
 #' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
 #' head(unionByName(df1, df2))
+#'
+#' df3 <- select(createDataFrame(mtcars), "carb")
+#' head(unionByName(df1, df3, allowMissingColumns = TRUE))
 #' }
 #' @note unionByName since 2.3.0
 setMethod("unionByName",
           signature(x = "SparkDataFrame", y = "SparkDataFrame"),
-          function(x, y) {
-            unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
+          function(x, y, allowMissingColumns=FALSE) {
+            unioned <- callJMethod(x@sdf, "unionByName", y@sdf, allowMissingColumns)
             dataFrame(unioned)
           })
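
To make the new `allowMissingColumns` flag concrete, a minimal sketch of the documented behavior (illustrative only, reusing the df1/df3 names from the roxygen example above; not part of the PR's diff):

    df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
    df3 <- select(createDataFrame(mtcars), "carb")
    u <- unionByName(df1, df3, allowMissingColumns = TRUE)
    # Rows originating from df3 are null-filled in "am" and "gear"; since df1
    # already contains every column, the result schema stays (carb, am, gear).
    printSchema(u)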
100 changes: 100 additions & 0 deletions R/pkg/R/column.R
@@ -356,3 +356,103 @@ setMethod("%<=>%",
 #' }
 #' @note ! since 2.3.0
 setMethod("!", signature(x = "Column"), function(x) not(x))
+
+#' withField
+#'
+#' Adds/replaces field in a struct \code{Column} by name.
+#'
+#' @param x a Column
+#' @param fieldName a character
+#' @param col a Column expression
+#'
+#' @rdname withField
+#' @aliases withField withField,Column-method
+#' @examples
+#' \dontrun{
+#' df <- withColumn(
+#'   createDataFrame(iris),
+#'   "sepal",
+#'   struct(column("Sepal_Width"), column("Sepal_Length"))
+#' )
+#'
+#' head(select(
+#'   df,
+#'   withField(df$sepal, "product", df$Sepal_Length * df$Sepal_Width)
+#' ))
+#' }
+#' @note withField since 3.1.0
+setMethod("withField",
+          signature(x = "Column", fieldName = "character", col = "Column"),
+          function(x, fieldName, col) {
+            jc <- callJMethod(x@jc, "withField", fieldName, col@jc)
+            column(jc)
+          })
+
+#' dropFields
+#'
+#' Drops fields in a struct \code{Column} by name.
+#'
+#' @param x a Column
+#' @param ... names of the fields to be dropped.
+#'
+#' @rdname dropFields
+#' @aliases dropFields dropFields,Column-method
+#' @examples
+#' \dontrun{
+#' df <- select(
+#'   createDataFrame(iris),
+#'   alias(
+#'     struct(
+#'       column("Sepal_Width"), column("Sepal_Length"),
+#'       alias(
+#'         struct(
+#'           column("Petal_Width"), column("Petal_Length"),
+#'           alias(
+#'             column("Petal_Width") * column("Petal_Length"),
+#'             "Petal_Product"
+#'           )
+#'         ),
+#'         "Petal"
+#'       )
+#'     ),
+#'     "dimensions"
+#'   )
+#' )
+#' head(withColumn(df, "dimensions", dropFields(df$dimensions, "Petal")))
+#'
+#' head(
+#'   withColumn(
+#'     df, "dimensions",
+#'     dropFields(df$dimensions, "Sepal_Width", "Sepal_Length")
+#'   )
+#' )
+#'
+#' # This method supports dropping multiple nested fields directly e.g.
+#' head(
+#'   withColumn(
+#'     df, "dimensions",
+#'     dropFields(df$dimensions, "Petal.Petal_Width", "Petal.Petal_Length")
+#'   )
+#' )
+#'
+#' # However, if you are going to add/replace multiple nested fields,
+#' # it is preferred to extract out the nested struct before
+#' # adding/replacing multiple fields e.g.
+#' head(
+#'   withColumn(
+#'     df, "dimensions",
+#'     withField(
+#'       column("dimensions"),
+#'       "Petal",
+#'       dropFields(column("dimensions.Petal"), "Petal_Width", "Petal_Length")
+#'     )
+#'   )
+#' )
+#' }
+#' @note dropFields since 3.1.0
+setMethod("dropFields",
+          signature(x = "Column"),
+          function(x, ...) {
+            jc <- callJMethod(x@jc, "dropFields", list(...))
+            column(jc)
+          })