Merge pull request #1566 from simonzhaoms/simonz/sarplus/spark3
Add Spark 3.x support for sarplus
Showing 26 changed files with 681 additions and 292 deletions.
`.github/workflows/sarplus.yml` (new file; @@ -0,0 +1,159 @@)

```yaml
# This workflow will run tests and do packaging for contrib/sarplus.
#
# References:
# * GitHub Actions workflow templates
#   + [python package](https://github.com/actions/starter-workflows/blob/main/ci/python-package.yml)
#   + [scala](https://github.com/actions/starter-workflows/blob/main/ci/scala.yml)
# * [GitHub hosted runner - Ubuntu 20.04 LTS](https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md)
# * [Azure Databricks runtime releases](https://docs.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases)

name: sarplus test and package

on:
  push:
    paths:
      - contrib/sarplus/python/**
      - contrib/sarplus/scala/**
      - contrib/sarplus/VERSION
      - .github/workflows/sarplus.yml

env:
  PYTHON_ROOT: ${{ github.workspace }}/contrib/sarplus/python
  SCALA_ROOT: ${{ github.workspace }}/contrib/sarplus/scala

jobs:
  python:
    # Test pysarplus with different versions of Python.
    # Package pysarplus and upload as GitHub workflow artifact when merged into
    # the main branch.
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
    steps:
      - uses: actions/checkout@v2

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install -U build pip twine
          python -m pip install -U flake8 pytest pytest-cov scikit-learn

      - name: Lint with flake8
        run: |
          cd "${PYTHON_ROOT}"
          # See https://flake8.pycqa.org/en/latest/user/index.html
          flake8 .

      - name: Package and check
        run: |
          cd "${PYTHON_ROOT}"
          cp ../VERSION ./pysarplus/
          python -m build --sdist
          python -m twine check dist/*

      - name: Test
        run: |
          cd "${PYTHON_ROOT}"
          python -m pip install dist/*.gz
          cd "${SCALA_ROOT}"
          # Derive the Spark, Scala and Hadoop versions from the installed
          # pyspark distribution, so the sbt build matches it.
          export SPARK_VERSION=$(python -m pip show pyspark | grep -i version | cut -d ' ' -f 2)
          SPARK_JAR_DIR=$(python -m pip show pyspark | grep -i location | cut -d ' ' -f2)/pyspark/jars
          SCALA_JAR=$(ls ${SPARK_JAR_DIR}/scala-library*)
          HADOOP_JAR=$(ls ${SPARK_JAR_DIR}/hadoop-client-api*)
          SCALA_VERSION=${SCALA_JAR##*-}
          export SCALA_VERSION=${SCALA_VERSION%.*}
          HADOOP_VERSION=${HADOOP_JAR##*-}
          export HADOOP_VERSION=${HADOOP_VERSION%.*}
          sbt ++"${SCALA_VERSION}"! package
          cd "${PYTHON_ROOT}"
          pytest ./tests
          echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV

      - name: Upload Python package as GitHub artifact
        if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10'
        uses: actions/upload-artifact@v2
        with:
          name: pysarplus-${{ env.sarplus_version }}
          path: ${{ env.PYTHON_ROOT }}/dist/*.gz

  scala-test:
    # Test sarplus with different versions of Databricks runtime, 2 LTSs and 1
    # latest.
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - scala-version: "2.12.10"
            spark-version: "3.0.1"
            hadoop-version: "2.7.4"
            databricks-runtime: "ADB 7.3 LTS"

          - scala-version: "2.12.10"
            spark-version: "3.1.2"
            hadoop-version: "2.7.4"
            databricks-runtime: "ADB 9.1 LTS"

          - scala-version: "2.12.14"
            spark-version: "3.2.0"
            hadoop-version: "3.3.1"
            databricks-runtime: "ADB 10.0"

    steps:
      - uses: actions/checkout@v2

      - name: Test
        run: |
          cd "${SCALA_ROOT}"
          export SPARK_VERSION="${{ matrix.spark-version }}"
          export HADOOP_VERSION="${{ matrix.hadoop-version }}"
          sbt ++${{ matrix.scala-version }}! test

  scala-package:
    # Package sarplus and upload as GitHub workflow artifact when merged into
    # the main branch.
    needs: scala-test
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Package
        env:
          GPG_KEY: ${{ secrets.SARPLUS_GPG_PRI_KEY_ASC }}
        run: |
          # generate artifacts
          cd "${SCALA_ROOT}"
          export SPARK_VERSION="3.1.2"
          export HADOOP_VERSION="2.7.4"
          export SCALA_VERSION="2.12.10"
          sbt ++${SCALA_VERSION}! package
          sbt ++${SCALA_VERSION}! packageDoc
          sbt ++${SCALA_VERSION}! packageSrc
          sbt ++${SCALA_VERSION}! makePom

          # generate the artifact for Spark 3.2+
          export SPARK_VERSION="3.2.0"
          export HADOOP_VERSION="3.3.1"
          export SCALA_VERSION="2.12.14"
          sbt ++${SCALA_VERSION}! package

          # sign with GPG
          cd target/scala-2.12
          gpg --import <(cat <<< "${GPG_KEY}")
          for file in {*.jar,*.pom}; do gpg -ab "${file}"; done

          # bundle
          jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc
          echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV

      - name: Upload Scala bundle as GitHub artifact
        uses: actions/upload-artifact@v2
        with:
          name: sarplus-bundle_2.12-${{ env.sarplus_version }}
          path: ${{ env.SCALA_ROOT }}/target/scala-2.12/sarplus-bundle_2.12-${{ env.sarplus_version }}.jar
```
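The `Test` step above derives the Scala and Hadoop versions by trimming jar filenames with Bash parameter expansion. A minimal sketch of that trimming, run on a sample filename (the path is made up for illustration):

```bash
# sample jar path (assumed for illustration)
SCALA_JAR="/opt/pyspark/jars/scala-library-2.12.14.jar"

SCALA_VERSION=${SCALA_JAR##*-}     # strip through the last '-': "2.12.14.jar"
SCALA_VERSION=${SCALA_VERSION%.*}  # drop the ".jar" suffix:     "2.12.14"
echo "${SCALA_VERSION}"            # prints 2.12.14
```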
@@ -1,41 +1,109 @@
# Packaging

For [databricks](https://databricks.com/) to properly install a [C++ extension](https://docs.python.org/3/extending/building.html), one must take a detour through [pypi](https://pypi.org/). Use [twine](https://github.com/pypa/twine) to upload the package to [pypi](https://pypi.org/).

```bash
# build dependencies
python -m pip install -U build pip twine

cd python
cp ../VERSION ./pysarplus/  # version file
python -m build --sdist
python -m twine upload dist/*
```
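Once the upload succeeds, the package can be installed anywhere straight from pypi, which is what a Databricks cluster does under the hood. A quick sanity check, assuming the published name `pysarplus`:

```bash
python -m pip install pysarplus
python -c "import pysarplus; print(pysarplus.__file__)"
```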
On [Spark](https://spark.apache.org/) one can install all 3 components (C++, Python, Scala) in one pass by creating a [Spark Package](https://spark-packages.org/). Steps to install:

1. Package and publish the [pip package](python/setup.py) (see above)
2. Package the [Spark package](scala/build.sbt), which includes the [Scala formatter](scala/src/main/scala/microsoft/sarplus) and references the pip package (see below)
3. Upload the zipped Scala package bundle to [Nexus Repository Manager](https://oss.sonatype.org/) through a browser (see the [publish manual](https://central.sonatype.org/publish/publish-manual/))
```bash
export SPARK_VERSION="3.1.2"
export HADOOP_VERSION="2.7.4"
export SCALA_VERSION="2.12.10"
GPG_KEY="<gpg-private-key>"

# generate artifacts
cd scala
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom

# generate the artifact (sarplus-*-spark32.jar) for Spark 3.2+
export SPARK_VERSION="3.2.0"
export HADOOP_VERSION="3.3.1"
export SCALA_VERSION="2.12.14"
sbt ++${SCALA_VERSION}! package

# sign with GPG
cd target/scala-${SCALA_VERSION%.*}
gpg --import <(cat <<< "${GPG_KEY}")
for file in {*.jar,*.pom}; do gpg -ab "${file}"; done

# bundle
jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc
```
where `SPARK_VERSION`, `HADOOP_VERSION` and `SCALA_VERSION` should be customized as needed.
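If you would rather match a locally installed pyspark than hard-code the versions, the derivation used by the CI workflow above can be reused. A sketch, assuming a pip-installed pyspark whose `jars` directory ships `scala-library` and `hadoop-client-api` jars:

```bash
# derive versions from the installed pyspark distribution
export SPARK_VERSION=$(python -m pip show pyspark | grep -i version | cut -d ' ' -f 2)
SPARK_JAR_DIR=$(python -m pip show pyspark | grep -i location | cut -d ' ' -f 2)/pyspark/jars
SCALA_JAR=$(ls ${SPARK_JAR_DIR}/scala-library*)
HADOOP_JAR=$(ls ${SPARK_JAR_DIR}/hadoop-client-api*)
SCALA_VERSION=${SCALA_JAR##*-};   export SCALA_VERSION=${SCALA_VERSION%.*}
HADOOP_VERSION=${HADOOP_JAR##*-}; export HADOOP_VERSION=${HADOOP_VERSION%.*}
```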
## Testing

To test the python UDF + C++ backend

```bash
# build dependencies
python -m pip install -U build pip twine

# build
cd python
cp ../VERSION ./pysarplus/  # version file
python -m build --sdist

# install the built package, then test
python -m pip install dist/*.gz
pytest ./tests
```
To test the Scala formatter

```bash
export SPARK_VERSION=3.2.0
export HADOOP_VERSION=3.3.1
export SCALA_VERSION=2.12.14

cd scala
sbt ++${SCALA_VERSION}! test
```
(Use `~test` instead of `test`, and sbt will automatically re-run the tests whenever a source file changes; changes to `build.sbt` are not watched.)
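For example, with the same environment variables exported as above:

```bash
cd scala
sbt ++${SCALA_VERSION}! "~test"   # re-runs the tests on each source change; Ctrl+C to stop
```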
## Notes for Spark 3.x

The code has been modified to support Spark 3.x, and has been tested against different versions of Databricks Runtime (including 6.4 Extended Support, 7.3 LTS, 9.1 LTS, 10.0 and 10.1) on the Azure Databricks service. However, there is a breaking change in [org.apache.spark.sql.execution.datasources.OutputWriter](https://github.com/apache/spark/blob/dc0fa1eef74238d745dabfdc86705b59d95b07e1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala#L74) in **Spark 3.2**, which adds an extra function `path()`, so an additional JAR file with the classifier `spark32` is needed when running on Spark 3.2 (see above for packaging).
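In practice this means picking the artifact that matches the cluster's Spark version. A hypothetical sketch, assuming the JAR naming produced by the packaging steps above:

```bash
# pick the right sarplus JAR for the target Spark version (names assumed)
SARPLUS_VERSION=$(cat VERSION)   # run from contrib/sarplus
if [[ "${SPARK_VERSION}" == 3.2* ]]; then
    SARPLUS_JAR="sarplus_2.12-${SARPLUS_VERSION}-spark32.jar"   # Spark 3.2 needs the spark32 classifier
else
    SARPLUS_JAR="sarplus_2.12-${SARPLUS_VERSION}.jar"
fi
echo "attach ${SARPLUS_JAR} to the cluster"
```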
In addition, extra configurations are required when running on Spark 3.x:

```
spark.sql.sources.default parquet
spark.sql.legacy.createHiveTableByDefault true
```