Commit 6848b2f

Merge branch 'master' into SPARK-32991

Parents: ccbf0ae + 306872e

98 files changed (+2212, -293 lines)


.github/workflows/build_and_test.yml

Lines changed: 100 additions & 45 deletions
@@ -17,7 +17,8 @@ jobs:
   # Build: build Spark and run the tests for specified modules.
   build:
     name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
-    runs-on: ubuntu-latest
+    # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
+    runs-on: ubuntu-20.04
     strategy:
       fail-fast: false
       matrix:
@@ -41,10 +42,6 @@ jobs:
             streaming, sql-kafka-0-10, streaming-kafka-0-10,
             mllib-local, mllib,
             yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
-          - >-
-            pyspark-sql, pyspark-mllib, pyspark-resource
-          - >-
-            pyspark-core, pyspark-streaming, pyspark-ml
           - >-
             sparkr
         # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
@@ -127,42 +124,17 @@ jobs:
       uses: actions/setup-java@v1
       with:
         java-version: ${{ matrix.java }}
-    # PySpark
-    - name: Install PyPy3
-      # Note that order of Python installations here matters because default python3 is
-      # overridden by pypy3.
-      uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark')
-      with:
-        python-version: pypy3
-        architecture: x64
-    - name: Install Python 3.6
-      uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark')
-      with:
-        python-version: 3.6
-        architecture: x64
     - name: Install Python 3.8
       uses: actions/setup-python@v2
       # We should install one Python that is higher then 3+ for SQL and Yarn because:
       # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
       # - Yarn has a Python specific test too, for example, YarnClusterSuite.
-      if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       with:
         python-version: 3.8
         architecture: x64
-    - name: Install Python packages (Python 3.6 and PyPy3)
-      if: contains(matrix.modules, 'pyspark')
-      # PyArrow is not supported in PyPy yet, see ARROW-2651.
-      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
-      run: |
-        python3.6 -m pip install numpy pyarrow pandas scipy xmlrunner
-        python3.6 -m pip list
-        # PyPy does not have xmlrunner
-        pypy3 -m pip install numpy pandas
-        pypy3 -m pip list
     - name: Install Python packages (Python 3.8)
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       run: |
         python3.8 -m pip install numpy pyarrow pandas scipy xmlrunner
         python3.8 -m pip list
@@ -201,10 +173,97 @@ jobs:
         name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
         path: "**/target/unit-tests.log"

+  pyspark:
+    name: "Build modules: ${{ matrix.modules }}"
+    runs-on: ubuntu-20.04
+    container:
+      image: dongjoon/apache-spark-github-action-image:20201015
+    strategy:
+      fail-fast: false
+      matrix:
+        modules:
+          - >-
+            pyspark-sql, pyspark-mllib, pyspark-resource
+          - >-
+            pyspark-core, pyspark-streaming, pyspark-ml
+    env:
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      HADOOP_PROFILE: hadoop3.2
+      HIVE_PROFILE: hive2.3
+      # GitHub Actions' default miniconda to use in pip packaging test.
+      CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
+      GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+      # In order to fetch changed files
+      with:
+        fetch-depth: 0
+    - name: Merge dispatched input branch
+      if: ${{ github.event.inputs.target != '' }}
+      run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
+    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+    - name: Cache Scala, SBT, Maven and Zinc
+      uses: actions/cache@v2
+      with:
+        path: |
+          build/apache-maven-*
+          build/zinc-*
+          build/scala-*
+          build/*.jar
+        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+        restore-keys: |
+          build-
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.m2/repository
+        key: pyspark-maven-${{ hashFiles('**/pom.xml') }}
+        restore-keys: |
+          pyspark-maven-
+    - name: Cache Ivy local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.ivy2/cache
+        key: pyspark-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          pyspark-ivy-
+    - name: Install Python 3.6
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.6
+        architecture: x64
+    # This step takes much less time (~30s) than other Python versions so it is not included
+    # in the Docker image being used. There is also a technical issue to install Python 3.6 on
+    # Ubuntu 20.04. See also SPARK-33162.
+    - name: Install Python packages (Python 3.6)
+      run: |
+        python3.6 -m pip install numpy pyarrow pandas scipy xmlrunner
+        python3.6 -m pip list
+    # Run the tests.
+    - name: Run tests
+      run: |
+        mkdir -p ~/.m2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
+        rm -rf ~/.m2/repository/org/apache/spark
+    - name: Upload test results to report
+      if: always()
+      uses: actions/upload-artifact@v2
+      with:
+        name: test-results-${{ matrix.modules }}--1.8-hadoop3.2-hive2.3
+        path: "**/target/test-reports/*.xml"
+    - name: Upload unit tests log files
+      if: failure()
+      uses: actions/upload-artifact@v2
+      with:
+        name: unit-tests-log-${{ matrix.modules }}--1.8-hadoop3.2-hive2.3
+        path: "**/target/unit-tests.log"
+
   # Static analysis, and documentation build
   lint:
     name: Linters, licenses, dependencies and documentation generation
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
@@ -271,7 +330,7 @@ jobs:

   java11:
     name: Java 11 build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
@@ -296,26 +355,22 @@ jobs:

   scala-213:
     name: Scala 2.13 build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
-    - name: Cache Maven local repository
+    - name: Cache Ivy local repository
       uses: actions/cache@v2
       with:
-        path: ~/.m2/repository
-        key: scala-213-maven-${{ hashFiles('**/pom.xml') }}
+        path: ~/.ivy2/cache
+        key: scala-213-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          scala-213-maven-
+          scala-213-ivy-
    - name: Install Java 11
      uses: actions/setup-java@v1
      with:
        java-version: 11
-    - name: Build with Maven
+    - name: Build with SBT
      run: |
-        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
-        export MAVEN_CLI_OPTS="--no-transfer-progress"
-        mkdir -p ~/.m2
        ./dev/change-scala-version.sh 2.13
-        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 -Pscala-2.13 install
-        rm -rf ~/.m2/repository/org/apache/spark
+        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Djava.version=11 -Pscala-2.13 compile test:compile

common/network-common/pom.xml

Lines changed: 4 additions & 0 deletions
@@ -91,6 +91,10 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-crypto</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.roaringbitmap</groupId>
+      <artifactId>RoaringBitmap</artifactId>
+    </dependency>

     <!-- Test dependencies -->
     <dependency>

common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java

Lines changed: 63 additions & 0 deletions
@@ -17,9 +17,11 @@

 package org.apache.spark.network.protocol;

+import java.io.IOException;
 import java.nio.charset.StandardCharsets;

 import io.netty.buffer.ByteBuf;
+import org.roaringbitmap.RoaringBitmap;

 /** Provides a canonical set of Encoders for simple types. */
 public class Encoders {
@@ -44,6 +46,40 @@ public static String decode(ByteBuf buf) {
     }
   }

+  /** Bitmaps are encoded with their serialization length followed by the serialization bytes. */
+  public static class Bitmaps {
+    public static int encodedLength(RoaringBitmap b) {
+      // Compress the bitmap before serializing it. Note that since BlockTransferMessage
+      // needs to invoke encodedLength first to figure out the length for the ByteBuf, it
+      // guarantees that the bitmap will always be compressed before being serialized.
+      b.trim();
+      b.runOptimize();
+      return b.serializedSizeInBytes();
+    }
+
+    public static void encode(ByteBuf buf, RoaringBitmap b) {
+      int encodedLength = b.serializedSizeInBytes();
+      // RoaringBitmap requires nio ByteBuffer for serde. We expose the netty ByteBuf as a nio
+      // ByteBuffer. Here, we need to explicitly manage the index so we can write into the
+      // ByteBuffer, and the write is reflected in the underneath ByteBuf.
+      b.serialize(buf.nioBuffer(buf.writerIndex(), encodedLength));
+      buf.writerIndex(buf.writerIndex() + encodedLength);
+    }
+
+    public static RoaringBitmap decode(ByteBuf buf) {
+      RoaringBitmap bitmap = new RoaringBitmap();
+      try {
+        bitmap.deserialize(buf.nioBuffer());
+        // RoaringBitmap deserialize does not advance the reader index of the underlying ByteBuf.
+        // Manually update the index here.
+        buf.readerIndex(buf.readerIndex() + bitmap.serializedSizeInBytes());
+      } catch (IOException e) {
+        throw new RuntimeException("Exception while decoding bitmap", e);
+      }
+      return bitmap;
+    }
+  }
+
   /** Byte arrays are encoded with their length followed by bytes. */
   public static class ByteArrays {
     public static int encodedLength(byte[] arr) {
@@ -135,4 +171,31 @@ public static long[] decode(ByteBuf buf) {
       return longs;
     }
   }
+
+  /** Bitmap arrays are encoded with the number of bitmaps followed by per-Bitmap encoding. */
+  public static class BitmapArrays {
+    public static int encodedLength(RoaringBitmap[] bitmaps) {
+      int totalLength = 4;
+      for (RoaringBitmap b : bitmaps) {
+        totalLength += Bitmaps.encodedLength(b);
+      }
+      return totalLength;
+    }
+
+    public static void encode(ByteBuf buf, RoaringBitmap[] bitmaps) {
+      buf.writeInt(bitmaps.length);
+      for (RoaringBitmap b : bitmaps) {
+        Bitmaps.encode(buf, b);
+      }
+    }
+
+    public static RoaringBitmap[] decode(ByteBuf buf) {
+      int numBitmaps = buf.readInt();
+      RoaringBitmap[] bitmaps = new RoaringBitmap[numBitmaps];
+      for (int i = 0; i < bitmaps.length; i ++) {
+        bitmaps[i] = Bitmaps.decode(buf);
+      }
+      return bitmaps;
+    }
+  }
 }
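For reference, here is a minimal usage sketch of the new encoders (not part of the commit; the class name BitmapEncodersExample and the sample bitmap values are invented for illustration). It also shows why encodedLength is meant to be called before encode: encodedLength is what trims and run-optimizes the bitmap, so the size it reports matches what encode then writes.

import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;
import org.apache.spark.network.protocol.Encoders;
import org.roaringbitmap.RoaringBitmap;

public class BitmapEncodersExample {
  public static void main(String[] args) {
    RoaringBitmap chunks = RoaringBitmap.bitmapOf(1, 5, 42, 1000);

    // encodedLength() compresses the bitmap (trim + runOptimize) and reports the
    // exact number of bytes that encode() will write.
    int len = Encoders.Bitmaps.encodedLength(chunks);
    ByteBuf buf = Unpooled.buffer(len);
    Encoders.Bitmaps.encode(buf, chunks);

    // decode() reads the bitmap back and advances the reader index past it.
    RoaringBitmap decoded = Encoders.Bitmaps.decode(buf);
    System.out.println(decoded.equals(chunks));  // true

    // Bitmap arrays are prefixed with a 4-byte count, then each bitmap in turn.
    RoaringBitmap[] bitmaps = { RoaringBitmap.bitmapOf(0, 1), RoaringBitmap.bitmapOf(7) };
    ByteBuf arrayBuf = Unpooled.buffer(Encoders.BitmapArrays.encodedLength(bitmaps));
    Encoders.BitmapArrays.encode(arrayBuf, bitmaps);
    RoaringBitmap[] roundTripped = Encoders.BitmapArrays.decode(arrayBuf);
    System.out.println(roundTripped.length);  // 2
  }
}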

common/network-shuffle/pom.xml

Lines changed: 9 additions & 0 deletions
@@ -57,6 +57,10 @@
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.roaringbitmap</groupId>
+      <artifactId>RoaringBitmap</artifactId>
+    </dependency>

     <!-- Test dependencies -->
     <dependency>
@@ -93,6 +97,11 @@
       <artifactId>mockito-core</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>

   <build>

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/BlockStoreClient.java

Lines changed: 21 additions & 0 deletions
@@ -29,6 +29,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import org.apache.spark.network.buffer.ManagedBuffer;
 import org.apache.spark.network.client.RpcResponseCallback;
 import org.apache.spark.network.client.TransportClient;
 import org.apache.spark.network.client.TransportClientFactory;
@@ -135,4 +136,24 @@ public void onFailure(Throwable t) {
       hostLocalDirsCompletable.completeExceptionally(e);
     }
   }
+
+  /**
+   * Push a sequence of shuffle blocks in a best-effort manner to a remote node asynchronously.
+   * These shuffle blocks, along with blocks pushed by other clients, will be merged into
+   * per-shuffle partition merged shuffle files on the destination node.
+   *
+   * @param host the host of the remote node.
+   * @param port the port of the remote node.
+   * @param blockIds block ids to be pushed
+   * @param buffers buffers to be pushed
+   * @param listener the listener to receive block push status.
+   */
+  public void pushBlocks(
+      String host,
+      int port,
+      String[] blockIds,
+      ManagedBuffer[] buffers,
+      BlockFetchingListener listener) {
+    throw new UnsupportedOperationException();
+  }
 }
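Because the base implementation above simply throws UnsupportedOperationException, only clients that actually support push-based shuffle are expected to override pushBlocks. A hypothetical caller-side sketch (the helper class, method name, and logging are invented for illustration and assume a concrete BlockStoreClient subclass that overrides pushBlocks):

import org.apache.spark.network.buffer.ManagedBuffer;
import org.apache.spark.network.shuffle.BlockFetchingListener;
import org.apache.spark.network.shuffle.BlockStoreClient;

public final class PushBlocksExample {
  // Hypothetical helper: "client" must be a subclass that overrides pushBlocks,
  // since the base implementation added in this commit just throws.
  static void pushShuffleBlocks(
      BlockStoreClient client,
      String host,
      int port,
      String[] blockIds,
      ManagedBuffer[] buffers) {
    client.pushBlocks(host, port, blockIds, buffers, new BlockFetchingListener() {
      @Override
      public void onBlockFetchSuccess(String blockId, ManagedBuffer data) {
        System.out.println("Pushed " + blockId);
      }

      @Override
      public void onBlockFetchFailure(String blockId, Throwable exception) {
        // Pushing is best-effort per the javadoc, so a failed push is only reported here.
        System.err.println("Failed to push " + blockId + ": " + exception);
      }
    });
  }
}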
