diff --git a/.github/actions/setup-spark-builder/action.yaml b/.github/actions/setup-spark-builder/action.yaml index ae3d3146ca..ebe8e0dc37 100644 --- a/.github/actions/setup-spark-builder/action.yaml +++ b/.github/actions/setup-spark-builder/action.yaml @@ -29,7 +29,7 @@ inputs: comet-version: description: 'The Comet version to use for Spark' required: true - default: '0.3.0-SNAPSHOT' + default: '0.4.0-SNAPSHOT' runs: using: "composite" steps: diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 0fc86d330c..70b452987d 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -44,7 +44,7 @@ jobs: - name: Extract Comet version id: extract_version run: | - # use the tag that triggered this workflow as the Comet version e.g. 0.3.0-rc1 + # use the tag that triggered this workflow as the Comet version e.g. 0.4.0-rc1 echo "COMET_VERSION=${GITHUB_REF##*/}" >> $GITHUB_ENV - name: Echo Comet version run: echo "The current Comet version is ${{ env.COMET_VERSION }}" diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml index bb81fbd55e..05c095b2f5 100644 --- a/.github/workflows/spark_sql_test.yml +++ b/.github/workflows/spark_sql_test.yml @@ -71,7 +71,7 @@ jobs: with: spark-version: ${{ matrix.spark-version.full }} spark-short-version: ${{ matrix.spark-version.short }} - comet-version: '0.3.0-SNAPSHOT' # TODO: get this from pom.xml + comet-version: '0.4.0-SNAPSHOT' # TODO: get this from pom.xml - name: Run Spark tests run: | cd apache-spark diff --git a/.github/workflows/spark_sql_test_ansi.yml b/.github/workflows/spark_sql_test_ansi.yml index db2c5708a0..06a5b2c8e7 100644 --- a/.github/workflows/spark_sql_test_ansi.yml +++ b/.github/workflows/spark_sql_test_ansi.yml @@ -69,7 +69,7 @@ jobs: with: spark-version: ${{ matrix.spark-version.full }} spark-short-version: ${{ matrix.spark-version.short }} - comet-version: '0.3.0-SNAPSHOT' # TODO: get this from pom.xml + comet-version: '0.4.0-SNAPSHOT' # TODO: get this from pom.xml - name: Run Spark tests run: | cd apache-spark diff --git a/common/pom.xml b/common/pom.xml index 0ac8cece5d..cda873abbc 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.3.0-SNAPSHOT + 0.4.0-SNAPSHOT ../pom.xml diff --git a/dev/diffs/3.4.3.diff b/dev/diffs/3.4.3.diff index d3d666329d..6ac55f95c9 100644 --- a/dev/diffs/3.4.3.diff +++ b/dev/diffs/3.4.3.diff @@ -7,7 +7,7 @@ index d3544881af1..bf0e2b53c70 100644 2.5.1 2.0.8 + 3.4 -+ 0.3.0-SNAPSHOT ++ 0.4.0-SNAPSHOT -# Aapche DataFusion Comet: Source Release Process +# Apache DataFusion Comet: Release Process -This documentation is for creating an official source release of Apache DataFusion Comet. +This documentation explains the release process for Apache DataFusion Comet. ## Creating the Release Candidate @@ -49,12 +49,18 @@ git checkout -b branch-0.1 git push apache branch-0.1 ``` -Create and merge a PR against the release branch to update the Maven version from `0.3.0-SNAPSHOT` to `0.1.0` +Update the `pom.xml` files in the release branch to update the Maven version from `0.1.0-SNAPSHOT` to `0.1.0`. + +There is no need to update the Rust crate versions because they will already be `0.1.0`. ### Update Version in main -Create a PR against the main branch to update the Rust crate version to `0.2.0` and the Maven version to `0.2.0-SNAPSHOT`. -The Spark diffs also need updating. +Create a PR against the main branch to prepare for developing the next release: + +- Update the Rust crate version to `0.2.0`. +- Update the Maven version to `0.2.0-SNAPSHOT` (both in the `pom.xml` files and also in the diff files + under `dev/diffs`). +- Update the CI scripts under the `.github` directory. ### Generate the Change Log @@ -81,13 +87,14 @@ python3 generate-changelog.py 0.0.0 HEAD 0.1.0 > ../changelog/0.1.0.md Create a PR against the _main_ branch to add this change log and once this is approved and merged, cherry-pick the commit into the release branch. -### Build the jars +### Build the jars #### Setup to do the build - The build process requires Docker. Download the latest Docker Desktop from https://www.docker.com/products/docker-desktop/. - If you have multiple docker contexts running switch to the context of the Docker Desktop. For example - - ```shell +The build process requires Docker. Download the latest Docker Desktop from https://www.docker.com/products/docker-desktop/. +If you have multiple docker contexts running switch to the context of the Docker Desktop. For example - + +```shell $ docker context ls NAME DESCRIPTION DOCKER ENDPOINT ERROR default Current DOCKER_HOST based configuration unix:///var/run/docker.sock @@ -95,12 +102,14 @@ desktop-linux Docker Desktop unix:///Users/parth/ my_custom_context * tcp://192.168.64.2:2376 $ docker context use desktop-linux - ``` +``` + #### Run the build script - The `build-release-comet.sh` script will create a docker image for each architecture and use the image + +The `build-release-comet.sh` script will create a docker image for each architecture and use the image to build the platform specific binaries. These builder images are created every time this script is run. -The script optionally allows overriding of the repository and branch to build the binaries from (Note that - the local git repo is not used in the building of the binaries, but it is used to build the final uber jar). +The script optionally allows overriding of the repository and branch to build the binaries from (Note that +the local git repo is not used in the building of the binaries, but it is used to build the final uber jar). ```shell Usage: build-release-comet.sh [options] @@ -122,8 +131,10 @@ cd dev/release && ./build-release-comet.sh && cd ../.. ``` #### Build output - The build output is installed to a temporary local maven repository. The build script will print the name of the repository -location at the end. This location will be required at the time of deploying the artifacts to a staging repository + +The build output is installed to a temporary local maven repository. The build script will print the name of the +repository location at the end. This location will be required at the time of deploying the artifacts to a staging +repository ### Tag the Release Candidate @@ -137,27 +148,28 @@ git tag 0.1.0-rc1 git push apache 0.1.0-rc1 ``` +Note that pushing a release candidate tag will trigger a GitHub workflow that will build a Docker image and publish +it to GitHub Container Registry at https://github.com/apache/datafusion-comet/pkgs/container/datafusion-comet + ## Publishing the Release Candidate This part of the process can mostly only be performed by a PMC member. -### Create the Release Candidate Tarball - -Run the create-tarball script on the release candidate tag (`0.1.0-rc1`) to create the source tarball and upload it to the dev subversion repository - -```shell -GH_TOKEN= ./dev/release/create-tarball.sh 0.1.0 1 -``` - ### Publish the maven artifacts + #### Setup maven + ##### One time project setup + Setting up your project in the ASF Nexus Repository from here: https://infra.apache.org/publishing-maven-artifacts.html + ##### Release Manager Setup -Set up your development environment from here: https://infra.apache.org/publishing-maven-artifacts.html + +Set up your development environment from here: https://infra.apache.org/publishing-maven-artifacts.html ##### Build and publish a release candidate to nexus. -The script `publish-to-maven.sh` will publish the artifacts created by the `build-release-comet.sh` script. + +The script `publish-to-maven.sh` will publish the artifacts created by the `build-release-comet.sh` script. The artifacts will be signed using the gpg key of the release manager and uploaded to the maven staging repository. Note: This script needs `xmllint` to be installed. On MacOS xmllint is available by default. @@ -183,7 +195,8 @@ GPG_KEY - GPG key used to sign release artifacts GPG_PASSPHRASE - Passphrase for GPG key ``` -example +example + ```shell /comet:$./dev/release/publish-to-maven.sh -u release_manager_asf_id -r /tmp/comet-staging-repo-VsYOX ASF Password : @@ -193,23 +206,56 @@ Creating Nexus staging repository ... ``` -In the Nexus repository UI (https://repository.apache.org/) locate and verify the artifacts in +In the Nexus repository UI (https://repository.apache.org/) locate and verify the artifacts in staging (https://central.sonatype.org/publish/release/#locate-and-examine-your-staging-repository). -If the artifacts appear to be correct, then close and release the repository so it is made visible. +If the artifacts appear to be correct, then close and release the repository so it is made visible (this should +actually happen automatically when running the script). + +### Create the Release Candidate Tarball + +Run the create-tarball script on the release candidate tag (`0.1.0-rc1`) to create the source tarball and upload it to +the dev subversion repository + +```shell +./dev/release/create-tarball.sh 0.1.0 1 +``` + +This will generate an email template for starting the vote. ### Start an Email Voting Thread Send the email that is generated in the previous step to `dev@datafusion.apache.org`. -### Publish the Release Tarball +## Publishing Binary Releases + +Once the vote passes, we can publish the source and binary releases. -Once the vote passes, run the release-tarball script to move the tarball to the release subversion repository. +### Publishing Source Tarball + +Run the release-tarball script to move the tarball to the release subversion repository. ```shell -./dev/release/create-tarball.sh 0.1.0 1 +./dev/release/release-tarball.sh 0.1.0 1 ``` +### Create a release in the GitHub repository + +Go to https://github.com/apache/datafusion-comet/releases and create a release for the release tag, and paste the +changelog in the description. + +### Publishing Maven Artifacts + +Promote the Maven artifacts from staging to production by visiting https://repository.apache.org/#stagingRepositories +and selecting the staging repository and then clicking the "release" button. + +### Publishing Crates + +Publish the `datafusion-comet-spark-expr` crate to crates.io so that other Rust projects can leverage the +Spark-compatible operators and expressions outside of Spark. + +### Push a release tag to the repo + Push a release tag (`0.1.0`) to the `apache` repository. ```shell @@ -219,6 +265,9 @@ git tag 0.1.0 git push apache 0.1.0 ``` +Note that pushing a release tag will trigger a GitHub workflow that will build a Docker image and publish +it to GitHub Container Registry at https://github.com/apache/datafusion-comet/pkgs/container/datafusion-comet + Reply to the vote thread to close the vote and announce the release. ## Post Release Admin @@ -260,20 +309,9 @@ svn ls https://dist.apache.org/repos/dist/release/datafusion | grep comet Delete a release: ```shell -svn delete -m "delete old DataFusion Comet release" https://dist.apache.org/repos/dist/release/datafusion-comet/datafusion-comet-0.0.0 +svn delete -m "delete old DataFusion Comet release" https://dist.apache.org/repos/dist/release/datafusion/datafusion-comet-0.0.0 ``` -## Publishing Binary Releases - -### Publishing JAR Files to Maven - -Once the vote has passed, promote the staged release candidate to production in the Nexus repository UI (https://repository.apache.org/). - -### Publishing to crates.io - -We may choose to publish the `datafusion-comet` to crates.io so that other Rust projects can leverage the -Spark-compatible operators and expressions outside of Spark. - ## Post Release Activities Writing a blog post about the release is a great way to generate more interest in the project. We typically create a diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh index 1bec80051c..6387cf485a 100755 --- a/dev/release/create-tarball.sh +++ b/dev/release/create-tarball.sh @@ -53,11 +53,6 @@ if [ "$#" -ne 2 ]; then exit fi -if [[ -z "${GH_TOKEN}" ]]; then - echo "Please set personal github token through GH_TOKEN environment variable" - exit -fi - version=$1 rc=$2 tag="${version}-rc${rc}" @@ -87,7 +82,8 @@ I would like to propose a release of Apache DataFusion Comet version ${version}. This release candidate is based on commit: ${release_hash} [1] The proposed release tarball and signatures are hosted at [2]. -The changelog is located at [3]. +Pre-built jar files are available in a Maven staging repository [3]. +The changelog is located at [4]. Please download, verify checksums and signatures, run the unit tests, and vote on the release. The vote will be open for at least 72 hours. @@ -107,7 +103,8 @@ Here is my vote: [1]: https://github.com/apache/datafusion-comet/tree/${release_hash} [2]: ${url} -[3]: https://github.com/apache/datafusion-comet/blob/${release_hash}/CHANGELOG.md +[3]: https://repository.apache.org/#nexus-search;quick~org.apache.datafusion +[4]: https://github.com/apache/datafusion-comet/blob/${release_hash}/CHANGELOG.md MAIL echo "---------------------------------------------------------" @@ -121,7 +118,7 @@ echo "Running rat license checker on ${tarball}" ${DEV_RELEASE_DIR}/run-rat.sh ${tarball} echo "Signing tarball and creating checksums" -gpg --armor --output ${tarball}.asc --detach-sig ${tarball} +gpg --pinentry-mode loopback --armor --output ${tarball}.asc --detach-sig ${tarball} # create signing with relative path of tarball # so that they can be verified with a command such as # shasum --check apache-datafusion-comet-0.1.0-rc1.tar.gz.sha512 diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 291c7b786a..07c8c1b0d7 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -2,18 +2,25 @@ *.dockerignore .github/pull_request_template.md .gitmodules -core/Cargo.lock -core/testdata/backtrace.txt -core/testdata/stacktrace.txt +native/Cargo.lock +native/testdata/backtrace.txt +native/testdata/stacktrace.txt +dev/copyright/scala-header.txt dev/release/requirements.txt +dev/release/rat_exclude_files.txt docs/spark_builtin_expr_coverage.txt +docs/source/_static/images/*.svg docs/source/contributor-guide/benchmark-results/**/*.json docs/logos/*.png docs/logos/*.svg rust-toolchain +spark/src/test/resources/tpcds-extended/q*.sql spark/src/test/resources/tpcds-query-results/*.out +spark/src/test/resources/tpcds-micro-benchmarks/*.sql spark/src/test/resources/tpcds-plan-stability/approved-plans*/**/explain.txt spark/src/test/resources/tpcds-plan-stability/approved-plans*/**/simplified.txt spark/src/test/resources/tpch-query-results/*.out -spark/src/test/resources/tpch-extended/q1.sql +spark/src/test/resources/tpch-extended/q*.sql +spark/src/test/resources/test-data/*.csv +spark/src/test/resources/test-data/*.ndjson spark/inspections/CometTPC*results.txt diff --git a/dev/release/verifying-release-candidates.md b/dev/release/verifying-release-candidates.md index ca93ad5040..fdaa5a9b32 100644 --- a/dev/release/verifying-release-candidates.md +++ b/dev/release/verifying-release-candidates.md @@ -37,6 +37,8 @@ make release-nogit We hope that users will verify the release beyond running this script by testing the release candidate with their existing Spark jobs and report any functional issues or performance regressions. +The email announcing the vote should contain a link to pre-built jar files in a Maven staging repository. + Another way of verifying the release is to follow the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html) and compare performance with the previous release. diff --git a/docs/source/user-guide/installation.md b/docs/source/user-guide/installation.md index a0a7aa7b27..dc4429b8b9 100644 --- a/docs/source/user-guide/installation.md +++ b/docs/source/user-guide/installation.md @@ -38,7 +38,7 @@ See the [Comet Kubernetes Guide](kubernetes.md) guide. ## Using a Published JAR File -There are no published JAR files yet. +Pre-built jar files are available in Maven central at https://central.sonatype.com/namespace/org.apache.datafusion ## Using a Published Source Release @@ -46,7 +46,7 @@ Official source releases can be downloaded from https://dist.apache.org/repos/di ```console # Pick the latest version -export COMET_VERSION=0.2.0 +export COMET_VERSION=0.3.0 # Download the tarball curl -O "https://dist.apache.org/repos/dist/release/datafusion/datafusion-comet-$COMET_VERSION/apache-datafusion-comet-$COMET_VERSION.tar.gz" # Unpack diff --git a/fuzz-testing/pom.xml b/fuzz-testing/pom.xml index 1e6ad326fc..11f57700dd 100644 --- a/fuzz-testing/pom.xml +++ b/fuzz-testing/pom.xml @@ -25,7 +25,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.3.0-SNAPSHOT + 0.4.0-SNAPSHOT ../pom.xml diff --git a/native/Cargo.lock b/native/Cargo.lock index e55ea46e0d..c9301c6e3d 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -870,7 +870,7 @@ dependencies = [ [[package]] name = "datafusion-comet" -version = "0.3.0" +version = "0.4.0" dependencies = [ "ahash", "arrow", @@ -922,7 +922,7 @@ dependencies = [ [[package]] name = "datafusion-comet-proto" -version = "0.3.0" +version = "0.4.0" dependencies = [ "prost 0.12.6", "prost-build", @@ -930,7 +930,7 @@ dependencies = [ [[package]] name = "datafusion-comet-spark-expr" -version = "0.3.0" +version = "0.4.0" dependencies = [ "arrow", "arrow-array", diff --git a/native/Cargo.toml b/native/Cargo.toml index 6c7d235e44..87c988aaf2 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -20,7 +20,7 @@ members = ["core", "spark-expr", "proto"] resolver = "2" [workspace.package] -version = "0.3.0" +version = "0.4.0" homepage = "https://datafusion.apache.org/comet" repository = "https://github.com/apache/datafusion-comet" authors = ["Apache DataFusion "] @@ -47,8 +47,8 @@ datafusion-expr = { version = "42.0.0", default-features = false } datafusion-execution = { version = "42.0.0", default-features = false } datafusion-physical-plan = { version = "42.0.0", default-features = false } datafusion-physical-expr = { version = "42.0.0", default-features = false } -datafusion-comet-spark-expr = { path = "spark-expr", version = "0.3.0" } -datafusion-comet-proto = { path = "proto", version = "0.3.0" } +datafusion-comet-spark-expr = { path = "spark-expr", version = "0.4.0" } +datafusion-comet-proto = { path = "proto", version = "0.4.0" } chrono = { version = "0.4", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8" } num = "0.4" diff --git a/native/README.md b/native/README.md new file mode 100644 index 0000000000..714f0f2d6b --- /dev/null +++ b/native/README.md @@ -0,0 +1,26 @@ + + +# Apache DataFusion Comet Native Code + +This project contains the following crates: + +- [core](core): Native code used by the Comet Spark plugin +- [proto](proto): Comet protocol buffer definition for query plans +- [spark-expr](spark-expr): Spark-compatible DataFusion operators and expressions \ No newline at end of file diff --git a/pom.xml b/pom.xml index 2893b705da..ed0a8afb2f 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.3.0-SNAPSHOT + 0.4.0-SNAPSHOT pom Comet Project Parent POM diff --git a/spark-integration/pom.xml b/spark-integration/pom.xml index 1175ad0069..a9e1619a1a 100644 --- a/spark-integration/pom.xml +++ b/spark-integration/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.3.0-SNAPSHOT + 0.4.0-SNAPSHOT ../pom.xml diff --git a/spark/pom.xml b/spark/pom.xml index e4f3396036..d1bf0fa2f8 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -26,7 +26,7 @@ under the License. org.apache.datafusion comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.3.0-SNAPSHOT + 0.4.0-SNAPSHOT ../pom.xml