Merge pull request #1566 from simonzhaoms/simonz/sarplus/spark3
Add Spark 3.x support for sarplus
Showing 26 changed files with 681 additions and 292 deletions.
`.github/workflows/sarplus.yml` (new file; @@ -0,0 +1,159 @@)

```yaml
# This workflow will run tests and do packaging for contrib/sarplus.
#
# References:
# * GitHub Actions workflow templates
#   + [python package](https://github.com/actions/starter-workflows/blob/main/ci/python-package.yml)
#   + [scala](https://github.com/actions/starter-workflows/blob/main/ci/scala.yml)
# * [GitHub hosted runner - Ubuntu 20.04 LTS](https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md)
# * [Azure Databricks runtime releases](https://docs.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases)

name: sarplus test and package

on:
  push:
    paths:
      - contrib/sarplus/python/**
      - contrib/sarplus/scala/**
      - contrib/sarplus/VERSION
      - .github/workflows/sarplus.yml

env:
  PYTHON_ROOT: ${{ github.workspace }}/contrib/sarplus/python
  SCALA_ROOT: ${{ github.workspace }}/contrib/sarplus/scala

jobs:
  python:
    # Test pysarplus with different versions of Python.
    # Package pysarplus and upload as GitHub workflow artifact when merged into
    # the main branch.
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
    steps:
      - uses: actions/checkout@v2

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install -U build pip twine
          python -m pip install -U flake8 pytest pytest-cov scikit-learn

      - name: Lint with flake8
        run: |
          cd "${PYTHON_ROOT}"
          # See https://flake8.pycqa.org/en/latest/user/index.html
          flake8 .

      - name: Package and check
        run: |
          cd "${PYTHON_ROOT}"
          cp ../VERSION ./pysarplus/
          python -m build --sdist
          python -m twine check dist/*

      - name: Test
        run: |
          cd "${PYTHON_ROOT}"
          python -m pip install dist/*.gz
          cd "${SCALA_ROOT}"
          # Derive the Spark, Scala and Hadoop versions from the installed
          # pyspark distribution, so the sbt build matches it.
          export SPARK_VERSION=$(python -m pip show pyspark | grep -i version | cut -d ' ' -f 2)
          SPARK_JAR_DIR=$(python -m pip show pyspark | grep -i location | cut -d ' ' -f2)/pyspark/jars
          SCALA_JAR=$(ls ${SPARK_JAR_DIR}/scala-library*)
          HADOOP_JAR=$(ls ${SPARK_JAR_DIR}/hadoop-client-api*)
          SCALA_VERSION=${SCALA_JAR##*-}
          export SCALA_VERSION=${SCALA_VERSION%.*}
          HADOOP_VERSION=${HADOOP_JAR##*-}
          export HADOOP_VERSION=${HADOOP_VERSION%.*}
          sbt ++"${SCALA_VERSION}"! package
          cd "${PYTHON_ROOT}"
          pytest ./tests
          echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV

      - name: Upload Python package as GitHub artifact
        if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10'
        uses: actions/upload-artifact@v2
        with:
          name: pysarplus-${{ env.sarplus_version }}
          path: ${{ env.PYTHON_ROOT }}/dist/*.gz

  scala-test:
    # Test sarplus with different versions of Databricks runtime, 2 LTSs and 1
    # latest.
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - scala-version: "2.12.10"
            spark-version: "3.0.1"
            hadoop-version: "2.7.4"
            databricks-runtime: "ADB 7.3 LTS"

          - scala-version: "2.12.10"
            spark-version: "3.1.2"
            hadoop-version: "2.7.4"
            databricks-runtime: "ADB 9.1 LTS"

          - scala-version: "2.12.14"
            spark-version: "3.2.0"
            hadoop-version: "3.3.1"
            databricks-runtime: "ADB 10.0"

    steps:
      - uses: actions/checkout@v2

      - name: Test
        run: |
          cd "${SCALA_ROOT}"
          export SPARK_VERSION="${{ matrix.spark-version }}"
          export HADOOP_VERSION="${{ matrix.hadoop-version }}"
          sbt ++${{ matrix.scala-version }}! test

  scala-package:
    # Package sarplus and upload as GitHub workflow artifact when merged into
    # the main branch.
    needs: scala-test
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Package
        env:
          GPG_KEY: ${{ secrets.SARPLUS_GPG_PRI_KEY_ASC }}
        run: |
          # generate artifacts
          cd "${SCALA_ROOT}"
          export SPARK_VERSION="3.1.2"
          export HADOOP_VERSION="2.7.4"
          export SCALA_VERSION="2.12.10"
          sbt ++${SCALA_VERSION}! package
          sbt ++${SCALA_VERSION}! packageDoc
          sbt ++${SCALA_VERSION}! packageSrc
          sbt ++${SCALA_VERSION}! makePom

          # generate the artifact for Spark 3.2+
          export SPARK_VERSION="3.2.0"
          export HADOOP_VERSION="3.3.1"
          export SCALA_VERSION="2.12.14"
          sbt ++${SCALA_VERSION}! package

          # sign with GPG
          cd target/scala-2.12
          gpg --import <(cat <<< "${GPG_KEY}")
          for file in {*.jar,*.pom}; do gpg -ab "${file}"; done

          # bundle
          jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc
          echo "sarplus_version=$(cat ../VERSION)" >> $GITHUB_ENV

      - name: Upload Scala bundle as GitHub artifact
        uses: actions/upload-artifact@v2
        with:
          name: sarplus-bundle_2.12-${{ env.sarplus_version }}
          path: ${{ env.SCALA_ROOT }}/target/scala-2.12/sarplus-bundle_2.12-${{ env.sarplus_version }}.jar
```
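The `Test` step above derives the Scala and Hadoop versions by trimming jar filenames with Bash parameter expansion. A minimal sketch of that trimming, run on a sample filename (the path is made up for illustration):

```bash
# sample jar path (assumed for illustration)
SCALA_JAR="/opt/pyspark/jars/scala-library-2.12.14.jar"

SCALA_VERSION=${SCALA_JAR##*-}     # strip through the last '-': "2.12.14.jar"
SCALA_VERSION=${SCALA_VERSION%.*}  # drop the ".jar" suffix:     "2.12.14"
echo "${SCALA_VERSION}"            # prints 2.12.14
```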
@@ -1,41 +1,109 @@
# Packaging

For [databricks](https://databricks.com/) to properly install a [C++ extension](https://docs.python.org/3/extending/building.html), one must take a detour through [pypi](https://pypi.org/). Use [twine](https://github.com/pypa/twine) to upload the package to [pypi](https://pypi.org/).

```bash
# build dependencies
python -m pip install -U build pip twine

cd python
cp ../VERSION ./pysarplus/  # version file
python -m build --sdist
python -m twine upload dist/*
```
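Once the upload succeeds, the package can be installed anywhere straight from pypi, which is what a Databricks cluster does under the hood. A quick sanity check, assuming the published name `pysarplus`:

```bash
python -m pip install pysarplus
python -c "import pysarplus; print(pysarplus.__file__)"
```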
On [Spark](https://spark.apache.org/) one can install all 3 components (C++, Python, Scala) in one pass by creating a [Spark Package](https://spark-packages.org/). Steps to install:

1. Package and publish the [pip package](python/setup.py) (see above)
2. Package the [Spark package](scala/build.sbt), which includes the [Scala formatter](scala/src/main/scala/microsoft/sarplus) and references the pip package (see below)
3. Upload the zipped Scala package bundle to [Nexus Repository Manager](https://oss.sonatype.org/) through a browser (see the [publish manual](https://central.sonatype.org/publish/publish-manual/))
```bash
export SPARK_VERSION="3.1.2"
export HADOOP_VERSION="2.7.4"
export SCALA_VERSION="2.12.10"
GPG_KEY="<gpg-private-key>"

# generate artifacts
cd scala
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom

# generate the artifact (sarplus-*-spark32.jar) for Spark 3.2+
export SPARK_VERSION="3.2.0"
export HADOOP_VERSION="3.3.1"
export SCALA_VERSION="2.12.14"
sbt ++${SCALA_VERSION}! package

# sign with GPG
cd target/scala-${SCALA_VERSION%.*}
gpg --import <(cat <<< "${GPG_KEY}")
for file in {*.jar,*.pom}; do gpg -ab "${file}"; done

# bundle
jar cvf sarplus-bundle_2.12-$(cat ../VERSION).jar *.jar *.pom *.asc
```
where `SPARK_VERSION`, `HADOOP_VERSION` and `SCALA_VERSION` should be customized as needed.
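If you would rather match a locally installed pyspark than hard-code the versions, the derivation used by the CI workflow above can be reused. A sketch, assuming a pip-installed pyspark whose `jars` directory ships `scala-library` and `hadoop-client-api` jars:

```bash
# derive versions from the installed pyspark distribution
export SPARK_VERSION=$(python -m pip show pyspark | grep -i version | cut -d ' ' -f 2)
SPARK_JAR_DIR=$(python -m pip show pyspark | grep -i location | cut -d ' ' -f 2)/pyspark/jars
SCALA_JAR=$(ls ${SPARK_JAR_DIR}/scala-library*)
HADOOP_JAR=$(ls ${SPARK_JAR_DIR}/hadoop-client-api*)
SCALA_VERSION=${SCALA_JAR##*-};   export SCALA_VERSION=${SCALA_VERSION%.*}
HADOOP_VERSION=${HADOOP_JAR##*-}; export HADOOP_VERSION=${HADOOP_VERSION%.*}
```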
## Testing

To test the python UDF + C++ backend

```bash
# build dependencies
python -m pip install -U build pip twine

# build
cd python
cp ../VERSION ./pysarplus/  # version file
python -m build --sdist

# install the built package, then test
python -m pip install dist/*.gz
pytest ./tests
```
To test the Scala formatter

```bash
export SPARK_VERSION=3.2.0
export HADOOP_VERSION=3.3.1
export SCALA_VERSION=2.12.14

cd scala
sbt ++${SCALA_VERSION}! test
```
(Use `~test` instead of `test`, and sbt will automatically re-run the tests whenever a source file changes; changes to `build.sbt` are not watched.)
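For example, with the same environment variables exported as above:

```bash
cd scala
sbt ++${SCALA_VERSION}! "~test"   # re-runs the tests on each source change; Ctrl+C to stop
```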
## Notes for Spark 3.x

The code has been modified to support Spark 3.x, and has been tested against different versions of Databricks Runtime (including 6.4 Extended Support, 7.3 LTS, 9.1 LTS, 10.0 and 10.1) on the Azure Databricks service. However, there is a breaking change in [org.apache.spark.sql.execution.datasources.OutputWriter](https://github.com/apache/spark/blob/dc0fa1eef74238d745dabfdc86705b59d95b07e1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala#L74) in **Spark 3.2**, which adds an extra function `path()`, so an additional JAR file with the classifier `spark32` is needed when running on Spark 3.2 (see above for packaging).
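In practice this means picking the artifact that matches the cluster's Spark version. A hypothetical sketch, assuming the JAR naming produced by the packaging steps above:

```bash
# pick the right sarplus JAR for the target Spark version (names assumed)
SARPLUS_VERSION=$(cat VERSION)   # run from contrib/sarplus
if [[ "${SPARK_VERSION}" == 3.2* ]]; then
    SARPLUS_JAR="sarplus_2.12-${SARPLUS_VERSION}-spark32.jar"   # Spark 3.2 needs the spark32 classifier
else
    SARPLUS_JAR="sarplus_2.12-${SARPLUS_VERSION}.jar"
fi
echo "attach ${SARPLUS_JAR} to the cluster"
```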
In addition, extra configurations are required when running on Spark 3.x:

```
spark.sql.sources.default parquet
spark.sql.legacy.createHiveTableByDefault true
```