
Commit 42a52a1

Merge branch 'master' into SPARK-32201
2 parents: 0950e9a + db47c6e

678 files changed: 8923 additions & 5754 deletions

.github/workflows/master.yml

Lines changed: 186 additions & 103 deletions
@@ -9,148 +9,231 @@ on:
     - master
 
 jobs:
+  # TODO(SPARK-32248): Recover JDK 11 builds
+  # Build: build Spark and run the tests for specified modules.
   build:
-
+    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
-        java: [ '1.8', '11' ]
-        hadoop: [ 'hadoop-2.7', 'hadoop-3.2' ]
-        hive: [ 'hive-1.2', 'hive-2.3' ]
-        exclude:
-        - java: '11'
-          hive: 'hive-1.2'
-        - hadoop: 'hadoop-3.2'
-          hive: 'hive-1.2'
-    name: Build Spark - JDK${{ matrix.java }}/${{ matrix.hadoop }}/${{ matrix.hive }}
-
+        java:
+          - 1.8
+        hadoop:
+          - hadoop3.2
+        hive:
+          - hive2.3
+        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
+        # Kinesis tests depends on external Amazon kinesis service.
+        # Note that the modules below are from sparktestsupport/modules.py.
+        modules:
+          - |-
+            core, unsafe, kvstore, avro,
+            network-common, network-shuffle, repl, launcher,
+            examples, sketch, graphx
+          - |-
+            catalyst, hive-thriftserver
+          - |-
+            streaming, sql-kafka-0-10, streaming-kafka-0-10,
+            mllib-local, mllib,
+            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
+          - |-
+            pyspark-sql, pyspark-mllib, pyspark-resource
+          - |-
+            pyspark-core, pyspark-streaming, pyspark-ml
+          - |-
+            sparkr
+        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
+        included-tags: [""]
+        excluded-tags: [""]
+        comment: [""]
+        include:
+          # Hive tests
+          - modules: hive
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- slow tests"
+          - modules: hive
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.SlowHiveTest
+            comment: "- other tests"
+          # SQL tests
+          - modules: sql
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            included-tags: org.apache.spark.tags.ExtendedSQLTest
+            comment: "- slow tests"
+          - modules: sql
+            java: 1.8
+            hadoop: hadoop3.2
+            hive: hive2.3
+            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
+            comment: "- other tests"
+    env:
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
+      HADOOP_PROFILE: ${{ matrix.hadoop }}
+      HIVE_PROFILE: ${{ matrix.hive }}
+      # GitHub Actions' default miniconda to use in pip packaging test.
+      CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
     steps:
-    - uses: actions/checkout@master
-    # We split caches because GitHub Action Cache has a 400MB-size limit.
-    - uses: actions/cache@v1
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+      # In order to fetch changed files
+      with:
+        fetch-depth: 0
+    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+    - name: Cache Scala, SBT, Maven and Zinc
+      uses: actions/cache@v1
       with:
         path: build
         key: build-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           build-
-    - uses: actions/cache@v1
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
       with:
-        path: ~/.m2/repository/com
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/org
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-
-    - uses: actions/cache@v1
-      with:
-        path: ~/.m2/repository/net
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-${{ hashFiles('**/pom.xml') }}
+        path: ~/.m2/repository
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-
-    - uses: actions/cache@v1
+          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
+    - name: Cache Ivy local repository
+      uses: actions/cache@v2
       with:
-        path: ~/.m2/repository/io
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-${{ hashFiles('**/pom.xml') }}
+        path: ~/.ivy2/cache
+        key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
         restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-
-    - name: Set up JDK ${{ matrix.java }}
+          ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
+    - name: Install JDK ${{ matrix.java }}
       uses: actions/setup-java@v1
       with:
         java-version: ${{ matrix.java }}
-    - name: Build with Maven
-      run: |
-        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
-        export MAVEN_CLI_OPTS="--no-transfer-progress"
-        mkdir -p ~/.m2
-        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install
-        rm -rf ~/.m2/repository/org/apache/spark
-
-
-  lint:
-    runs-on: ubuntu-latest
-    name: Linters (Java/Scala/Python), licenses, dependencies
-    steps:
-    - uses: actions/checkout@master
-    - uses: actions/setup-java@v1
+    # PySpark
+    - name: Install PyPy3
+      # Note that order of Python installations here matters because default python3 is
+      # overridden by pypy3.
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark')
       with:
-        java-version: '11'
-    - uses: actions/setup-python@v1
+        python-version: pypy3
+        architecture: x64
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
+      if: contains(matrix.modules, 'pyspark')
       with:
-        python-version: '3.x'
-        architecture: 'x64'
-    - name: Scala
-      run: ./dev/lint-scala
-    - name: Java
-      run: ./dev/lint-java
-    - name: Python
-      run: |
-        pip install flake8 sphinx numpy
-        ./dev/lint-python
-    - name: License
-      run: ./dev/check-license
-    - name: Dependencies
-      run: ./dev/test-dependencies.sh
-
-  lintr:
-    runs-on: ubuntu-latest
-    name: Linter (R)
-    steps:
-    - uses: actions/checkout@master
-    - uses: actions/setup-java@v1
+        python-version: 3.6
+        architecture: x64
+    - name: Install Python 3.8
+      uses: actions/setup-python@v2
+      # We should install one Python that is higher then 3+ for SQL and Yarn because:
+      # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
+      # - Yarn has a Python specific test too, for example, YarnClusterSuite.
+      if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       with:
-        java-version: '11'
-    - uses: r-lib/actions/setup-r@v1
+        python-version: 3.8
+        architecture: x64
+    - name: Install Python packages (Python 3.6 and PyPy3)
+      if: contains(matrix.modules, 'pyspark')
+      # PyArrow is not supported in PyPy yet, see ARROW-2651.
+      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
+      run: |
+        python3.6 -m pip install numpy pyarrow pandas scipy
+        python3.6 -m pip list
+        pypy3 -m pip install numpy pandas
+        pypy3 -m pip list
+    - name: Install Python packages (Python 3.8)
+      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      run: |
+        python3.8 -m pip install numpy pyarrow pandas scipy
+        python3.8 -m pip list
+    # SparkR
+    - name: Install R 3.6
+      uses: r-lib/actions/setup-r@v1
+      if: contains(matrix.modules, 'sparkr')
       with:
-        r-version: '3.6.2'
-    - name: Install lib
+        r-version: 3.6
+    - name: Install R packages
+      if: contains(matrix.modules, 'sparkr')
       run: |
         sudo apt-get install -y libcurl4-openssl-dev
-    - name: install R packages
+        sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
+        # Show installed packages in R.
+        sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
+    # Run the tests.
+    - name: "Run tests: ${{ matrix.modules }}"
       run: |
-        sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
-        sudo Rscript -e "devtools::install_github('jimhester/[email protected]')"
-    - name: package and install SparkR
-      run: ./R/install-dev.sh
-    - name: lint-r
-      run: ./dev/lint-r
+        # Hive tests become flaky when running in parallel as it's too intensive.
+        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
+        mkdir -p ~/.m2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
+        rm -rf ~/.m2/repository/org/apache/spark
 
-  docs:
+  # Static analysis, and documentation build
+  lint:
+    name: Linters, licenses, dependencies and documentation generation
     runs-on: ubuntu-latest
-    name: Generate documents
     steps:
-    - uses: actions/checkout@master
-    - uses: actions/cache@v1
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
       with:
         path: ~/.m2/repository
         key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
-          docs-maven-repo-
-    - uses: actions/setup-java@v1
+          docs-maven-
+    - name: Install JDK 1.8
+      uses: actions/setup-java@v1
       with:
-        java-version: '1.8'
-    - uses: actions/setup-python@v1
+        java-version: 1.8
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
       with:
-        python-version: '3.x'
-        architecture: 'x64'
-    - uses: actions/setup-ruby@v1
+        python-version: 3.6
+        architecture: x64
+    - name: Install Python linter dependencies
+      run: |
+        pip3 install flake8 sphinx numpy
+    - name: Install R 3.6
+      uses: r-lib/actions/setup-r@v1
       with:
-        ruby-version: '2.7'
-    - uses: r-lib/actions/setup-r@v1
+        r-version: 3.6
+    - name: Install R linter dependencies and SparkR
+      run: |
+        sudo apt-get install -y libcurl4-openssl-dev
+        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
+        sudo Rscript -e "devtools::install_github('jimhester/[email protected]')"
+        ./R/install-dev.sh
+    - name: Install Ruby 2.7 for documentation generation
+      uses: actions/setup-ruby@v1
       with:
-        r-version: '3.6.2'
-    - name: Install lib and pandoc
+        ruby-version: 2.7
+    - name: Install dependencies for documentation generation
       run: |
         sudo apt-get install -y libcurl4-openssl-dev pandoc
-    - name: Install packages
-      run: |
         pip install sphinx mkdocs numpy
         gem install jekyll jekyll-redirect-from rouge
-        sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
-    - name: Run jekyll build
+        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
+    - name: Scala linter
+      run: ./dev/lint-scala
+    - name: Java linter
+      run: ./dev/lint-java
+    - name: Python linter
+      run: ./dev/lint-python
+    - name: R linter
+      run: ./dev/lint-r
+    - name: License test
+      run: ./dev/check-license
+    - name: Dependencies test
+      run: ./dev/test-dependencies.sh
+    - name: Run documentation build
       run: |
         cd docs
         jekyll build
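
For reference, the new "Run tests" step is driven entirely by the matrix values exported through the env: block. Below is a rough, hand-expanded sketch of what that step runs for the hive "- slow tests" matrix entry; this is a hypothetical local invocation for illustration only, since in the workflow GitHub Actions substitutes the values itself.

    # Hand-expanded sketch of the "Run tests" step for the matrix entry
    #   modules: hive, included-tags: org.apache.spark.tags.SlowHiveTest
    export MODULES_TO_TEST="hive"
    export INCLUDED_TAGS="org.apache.spark.tags.SlowHiveTest"
    export EXCLUDED_TAGS=""
    export HADOOP_PROFILE="hadoop3.2"
    export HIVE_PROFILE="hive2.3"
    # Hive tests become flaky when running in parallel as it's too intensive.
    export SERIAL_SBT_TESTS=1
    mkdir -p ~/.m2
    ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" \
      --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
    rm -rf ~/.m2/repository/org/apache/spark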

R/pkg/tests/fulltests/test_context.R

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ test_that("utility function can be called", {
   expect_true(TRUE)
 })
 
-test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
+test_that("getClientModeSparkSubmitOpts() returns spark-submit args from allowList", {
   e <- new.env()
   e[["spark.driver.memory"]] <- "512m"
   ops <- getClientModeSparkSubmitOpts("sparkrmain", e)

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 4 additions & 4 deletions
@@ -3921,14 +3921,14 @@ test_that("No extra files are created in SPARK_HOME by starting session and maki
   # before creating a SparkSession with enableHiveSupport = T at the top of this test file
   # (filesBefore). The test here is to compare that (filesBefore) against the list of files before
   # any test is run in run-all.R (sparkRFilesBefore).
-  # sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs,
+  # sparkRAllowedSQLDirs is also defined in run-all.R, and should contain only 2 allowed dirs,
   # here allow the first value, spark-warehouse, in the diff, everything else should be exactly the
   # same as before any test is run.
-  compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]]))
+  compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRAllowedSQLDirs[[1]]))
   # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T
   # note: as the note above, after running all tests in this file while enableHiveSupport = T, we
-  # check the list of files again. This time we allow both whitelisted dirs to be in the diff.
-  compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs))
+  # check the list of files again. This time we allow both dirs to be in the diff.
+  compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRAllowedSQLDirs))
 })
 
 unlink(parquetPath)

R/pkg/tests/run-all.R

Lines changed: 2 additions & 2 deletions
@@ -35,8 +35,8 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
   install.spark(overwrite = TRUE)
 
   sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
-  sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
-  invisible(lapply(sparkRWhitelistSQLDirs,
+  sparkRAllowedSQLDirs <- c("spark-warehouse", "metastore_db")
+  invisible(lapply(sparkRAllowedSQLDirs,
                    function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
   sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)

appveyor.yml

Lines changed: 2 additions & 2 deletions
@@ -42,8 +42,8 @@ install:
   # Install maven and dependencies
   - ps: .\dev\appveyor-install-dependencies.ps1
   # Required package for R unit tests
-  - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')"
-  - cmd: R -e "packageVersion('knitr'); packageVersion('rmarkdown'); packageVersion('testthat'); packageVersion('e1071'); packageVersion('survival'); packageVersion('arrow')"
+  - cmd: Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')"
+  - cmd: Rscript -e "pkg_list <- as.data.frame(installed.packages()[,c(1, 3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]"
 
 build_script:
   # '-Djna.nosys=true' is required to avoid kernel32.dll load failure.
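
The second AppVeyor command above replaces the per-package packageVersion() calls with a single listing of every installed non-base R package. As an explanatory reading (not part of the commit): columns 1, 3 and 4 of installed.packages() are Package, Version and Priority, and only base or recommended packages carry a non-NA Priority, so the expression prints the name and version of everything else. It can be tried locally from any shell with R on the PATH:

    # Prints Package and Version for every installed R package whose Priority is NA,
    # i.e. every package that is not a base or recommended package.
    Rscript -e "pkg_list <- as.data.frame(installed.packages()[,c(1, 3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]"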

common/network-common/src/main/java/org/apache/spark/network/crypto/README.md

Lines changed: 1 addition & 1 deletion
@@ -155,4 +155,4 @@ server will be able to understand. This will cause the server to close the conne
 attacker tries to send any command to the server. The attacker can just hold the channel open for
 some time, which will be closed when the server times out the channel. These issues could be
 separately mitigated by adding a shorter timeout for the first message after authentication, and
-potentially by adding host blacklists if a possible attack is detected from a particular host.
+potentially by adding host reject-lists if a possible attack is detected from a particular host.
