diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake
index f35430431a6..6ed18c6f5ff 100644
--- a/CMake/resolve_dependency_modules/cudf.cmake
+++ b/CMake/resolve_dependency_modules/cudf.cmake
@@ -17,18 +17,53 @@ include_guard(GLOBAL)
 # 3.30.4 is the minimum version required by cudf
 cmake_minimum_required(VERSION 3.30.4)
 
-# Add velox_resolve_dependency_url here for rapids-cmake, rmm, and kvikio if a specific version or commit is needed.
+# The pins below are rewritten by scripts/update-cudf-deps.sh, which matches the
+# "# <dep> commit <sha> from <date>" comments and the set() layout used here.
+
+# rapids_cmake commit 5ec2245 from 2026-01-26
+set(VELOX_rapids_cmake_VERSION 26.04)
+set(VELOX_rapids_cmake_COMMIT 5ec22457e58953e0a68f0745ce7a11a896ba62b1)
+set(
+  VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM
+  bf7d4ed5885f5fe012c42fb0977e1fe1416896479ffd34baa0cf762d3e83dc80
+)
+set(
+  VELOX_rapids_cmake_SOURCE_URL
+  "https://github.com/rapidsai/rapids-cmake/archive/${VELOX_rapids_cmake_COMMIT}.tar.gz"
+)
+velox_resolve_dependency_url(rapids_cmake)
 
-set(VELOX_cudf_VERSION 26.02 CACHE STRING "cudf version")
+# rmm commit e728b29 from 2026-01-26
+set(VELOX_rmm_VERSION 26.04)
+set(VELOX_rmm_COMMIT e728b2923f748d71aad30294b6926f43cb4c826e)
+set(
+  VELOX_rmm_BUILD_SHA256_CHECKSUM
+  ec18d881b327514de154af67a33a1288eec7bcd86909f23c9bf2d90511b0cf2f
+)
+set(VELOX_rmm_SOURCE_URL "https://github.com/rapidsai/rmm/archive/${VELOX_rmm_COMMIT}.tar.gz")
+velox_resolve_dependency_url(rmm)
 
+# kvikio commit 0f03349 from 2026-01-26
+set(VELOX_kvikio_VERSION 26.04)
+set(VELOX_kvikio_COMMIT 0f03349bcaf029a2f582d9915a88d09e355ac691)
 set(
-  VELOX_cudf_BUILD_SHA256_CHECKSUM
-  96b54c2b33281f58183978429933740869ef384d2687308699a257b05076d4fd
+  VELOX_kvikio_BUILD_SHA256_CHECKSUM
+  728868c671e2686b5e9b7b4122d1661475f803c4fb98c0852d7be65c365d7b2d
 )
 set(
-  VELOX_cudf_SOURCE_URL
-  "https://github.com/rapidsai/cudf/archive/3f85f626633ca4202941cc3bf3112bcd319eab8e.tar.gz"
+  VELOX_kvikio_SOURCE_URL
+  "https://github.com/rapidsai/kvikio/archive/${VELOX_kvikio_COMMIT}.tar.gz"
 )
+velox_resolve_dependency_url(kvikio)
+
+# cudf commit 68a0714 from 2026-01-27
+set(VELOX_cudf_VERSION 26.04 CACHE STRING "cudf version")
+set(VELOX_cudf_COMMIT 68a0714a3701431041cb47bf1163706f597f9f48)
+set(
+  VELOX_cudf_BUILD_SHA256_CHECKSUM
+  0c723d7fd04eab60336dd4bcce41e225821d13b54cdabe485ec54517f3aa8b15
+)
+set(VELOX_cudf_SOURCE_URL "https://github.com/rapidsai/cudf/archive/${VELOX_cudf_COMMIT}.tar.gz")
 velox_resolve_dependency_url(cudf)
 
 # Use block so we don't leak variables
@@ -38,7 +73,30 @@ block(SCOPE_FOR VARIABLES)
   set(CUDF_BUILD_TESTUTIL OFF)
   set(BUILD_SHARED_LIBS ON)
 
-  # Add FetchContent_Declare here for rapids-cmake, rmm, and kvikio if a specific version or commit is needed.
+  FetchContent_Declare(
+    rapids-cmake
+    URL ${VELOX_rapids_cmake_SOURCE_URL}
+    URL_HASH ${VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM}
+    UPDATE_DISCONNECTED 1
+  )
+
+  FetchContent_Declare(
+    rmm
+    URL ${VELOX_rmm_SOURCE_URL}
+    URL_HASH ${VELOX_rmm_BUILD_SHA256_CHECKSUM}
+    SOURCE_SUBDIR
+      cpp
+    UPDATE_DISCONNECTED 1
+  )
+
+  FetchContent_Declare(
+    kvikio
+    URL ${VELOX_kvikio_SOURCE_URL}
+    URL_HASH ${VELOX_kvikio_BUILD_SHA256_CHECKSUM}
+    SOURCE_SUBDIR
+      cpp
+    UPDATE_DISCONNECTED 1
+  )
 
   FetchContent_Declare(
     cudf
diff --git a/scripts/update-cudf-deps.sh b/scripts/update-cudf-deps.sh
new file mode 100755
index 00000000000..31934a56246
--- /dev/null
+++ b/scripts/update-cudf-deps.sh
@@ -0,0 +1,223 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Updates the pinned commit, date comment, version and SHA256 checksum of the
+# RAPIDS dependencies declared in CMake/resolve_dependency_modules/cudf.cmake.
+# Needs curl, jq, sha256sum, sed and awk, plus network access to GitHub.
+
+set -euo pipefail
+
+# Print the command line help.
+usage() {
+  cat <<EOF
+Usage: $0 --branch <branch>
+       $0 --pr <number>
+       $0 --commit <sha>
+
+Options:
+  --branch <branch>  Update all cudf dependencies to latest from branch
+  --pr <number>      Update only cudf from a specific PR
+  --commit <sha>     Update all dependencies using cudf commit and compatible versions
+
+Examples:
+  $0 --branch main
+  $0 --branch release/26.02
+  $0 --pr 12345
+  $0 --commit abc123def456
+EOF
+}
+
+# Print an error message to stderr and exit with status 1.
+die() {
+  echo "Error: $*" >&2
+  exit 1
+}
+
+[[ $# -eq 0 ]] && usage && exit 1
+
+MODE="$1"
+ARG="${2:-}"
+if [[ -z $ARG ]]; then
+  echo "Error: $MODE requires an argument" >&2
+  usage
+  exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CMAKE_FILE="$SCRIPT_DIR/../CMake/resolve_dependency_modules/cudf.cmake"
+
+# Print "<sha> <YYYY-MM-DD>" for the commit that ref $2 resolves to in
+# rapidsai/$1, as reported by the GitHub commits API.
+get_commit_info() {
+  local repo=$1 branch=$2
+  curl -sf "https://api.github.com/repos/rapidsai/${repo}/commits/${branch}" |
+    jq -r '[.sha, .commit.committer.date[:10]] | join(" ")'
+}
+
+# Print "<sha> <YYYY-MM-DD>" for the first commit GitHub lists on the main
+# branch of rapidsai/$1 when the listing is filtered with until=$2.
+get_commit_before_date() {
+  local repo=$1 until_date=$2
+  curl -sf "https://api.github.com/repos/rapidsai/${repo}/commits?sha=main&until=${until_date}&per_page=1" |
+    jq -r '.[0] | [.sha, .commit.committer.date[:10]] | join(" ")'
+}
+
+# Print the SHA256 of the source tarball of rapidsai/$1 at commit $2.
+# -f makes curl fail on HTTP errors so that an error page is never hashed.
+get_sha256() {
+  curl -sfL "https://github.com/rapidsai/$1/archive/$2.tar.gz" | sha256sum | cut -d' ' -f1
+}
+
+# Print the major.minor version for ref $1: taken from the name for
+# release/X.Y branches, otherwise read from cudf's VERSION file at that ref.
+get_version() {
+  local branch=$1
+  if [[ $branch =~ ^release/([0-9]+\.[0-9]+)$ ]]; then
+    echo "${BASH_REMATCH[1]}"
+  else
+    curl -sf "https://raw.githubusercontent.com/rapidsai/cudf/${branch}/VERSION" |
+      grep -oE '^[0-9]+\.[0-9]+'
+  fi
+}
+
+# Rewrite the entries of dependency $1 in $CMAKE_FILE: the "# <dep> commit"
+# comment, VELOX_<dep>_COMMIT, VELOX_<dep>_VERSION (from the global $VERSION)
+# and the checksum line following VELOX_<dep>_BUILD_SHA256_CHECKSUM.
+# Arguments: <dep> <full sha> <YYYY-MM-DD> <sha256>
+update_dependency() {
+  local var=$1 commit=$2 date=$3 checksum=$4
+  sed -i "s/# ${var} commit [a-f0-9]* from [0-9-]*/# ${var} commit ${commit:0:7} from ${date}/" "$CMAKE_FILE"
+  sed -i "s/set(VELOX_${var}_COMMIT [a-f0-9]*)/set(VELOX_${var}_COMMIT ${commit})/" "$CMAKE_FILE"
+
+  # The cudf version line carries a trailing CACHE clause; the others do not.
+  if [[ $var == "cudf" ]]; then
+    sed -i "s/set(VELOX_${var}_VERSION [0-9.]* CACHE/set(VELOX_${var}_VERSION ${VERSION} CACHE/" "$CMAKE_FILE"
+  else
+    sed -i "s/set(VELOX_${var}_VERSION [0-9.]*)/set(VELOX_${var}_VERSION ${VERSION})/" "$CMAKE_FILE"
+  fi
+
+  # The checksum sits on its own line after the variable name, so remember
+  # that the name was seen and replace the next bare 64-hex-digit line.
+  awk -v var="VELOX_${var}_BUILD_SHA256_CHECKSUM" -v sum="$checksum" '
+    $0 ~ var { found=1 }
+    found && /^[[:space:]]*[a-f0-9]{64}[[:space:]]*$/ { sub(/[a-f0-9]{64}/, sum); found=0 }
+    { print }
+  ' "$CMAKE_FILE" >"${CMAKE_FILE}.tmp" && mv "${CMAKE_FILE}.tmp" "$CMAKE_FILE"
+}
+
+# Write the COMMITS/DATES/CHECKSUMS arrays for all four dependencies to
+# $CMAKE_FILE and print a summary of what was pinned.
+apply_all_updates() {
+  local dep
+  echo "Updating $CMAKE_FILE..."
+  for dep in rapids_cmake rmm kvikio cudf; do
+    update_dependency "$dep" "${COMMITS[$dep]}" "${DATES[$dep]}" "${CHECKSUMS[$dep]}"
+  done
+
+  echo "Done! Updated dependencies:"
+  for dep in rapids_cmake rmm kvikio cudf; do
+    echo " $dep: ${COMMITS[$dep]:0:7} (${DATES[$dep]})"
+  done
+}
+
+if [[ $MODE == "--pr" ]]; then
+  echo "Fetching cuDF PR #${ARG}..."
+  PR_INFO=$(curl -sf "https://api.github.com/repos/rapidsai/cudf/pulls/${ARG}")
+  SHA=$(echo "$PR_INFO" | jq -r '.head.sha')
+  BASE=$(echo "$PR_INFO" | jq -r '.base.ref')
+  VERSION=$(get_version "$BASE")
+  DATE=$(curl -sf "https://api.github.com/repos/rapidsai/cudf/commits/${SHA}" | jq -r '.commit.committer.date[:10]')
+
+  echo " Base: $BASE (version $VERSION)"
+  echo " Commit: ${SHA:0:7} from $DATE"
+  echo " Computing SHA256..."
+  CHECKSUM=$(get_sha256 "cudf" "$SHA")
+  echo " SHA256: $CHECKSUM"
+  echo
+
+  update_dependency "cudf" "$SHA" "$DATE" "$CHECKSUM"
+  echo "Done! Updated cudf to PR #${ARG}: ${SHA:0:7} ($DATE)"
+
+elif [[ $MODE == "--commit" ]]; then
+  echo "Fetching cuDF commit ${ARG:0:7}..."
+  COMMIT_INFO=$(curl -sf "https://api.github.com/repos/rapidsai/cudf/commits/${ARG}")
+  SHA=$(echo "$COMMIT_INFO" | jq -r '.sha')
+  DATE=$(echo "$COMMIT_INFO" | jq -r '.commit.committer.date[:10]')
+  TIMESTAMP=$(echo "$COMMIT_INFO" | jq -r '.commit.committer.date')
+  VERSION=$(get_version "$SHA")
+
+  echo " Commit: ${SHA:0:7} from $DATE"
+  echo " Version: $VERSION"
+  echo
+
+  declare -A COMMITS DATES CHECKSUMS
+  COMMITS[cudf]=$SHA
+  DATES[cudf]=$DATE
+
+  echo "Finding compatible dependency versions (main branch commits before $TIMESTAMP)..."
+  echo
+
+  for dep in rapids_cmake rmm kvikio; do
+    repo=${dep//_/-}
+    echo "Fetching $repo..."
+    read -r commit date < <(get_commit_before_date "$repo" "$TIMESTAMP") || true
+    [[ -n ${commit:-} ]] || die "no commit found for rapidsai/$repo before $TIMESTAMP"
+    echo " Commit: ${commit:0:7} from $date"
+    echo " Computing SHA256..."
+    checksum=$(get_sha256 "$repo" "$commit")
+    echo " SHA256: $checksum"
+
+    COMMITS[$dep]=$commit
+    DATES[$dep]=$date
+    CHECKSUMS[$dep]=$checksum
+    echo
+  done
+
+  echo "Computing SHA256 for cudf..."
+  CHECKSUMS[cudf]=$(get_sha256 "cudf" "$SHA")
+  echo " SHA256: ${CHECKSUMS[cudf]}"
+  echo
+
+  apply_all_updates
+
+elif [[ $MODE == "--branch" ]]; then
+  VERSION=$(get_version "$ARG")
+  echo "Updating cuDF dependencies from branch $ARG (version $VERSION)"
+  echo
+
+  declare -A COMMITS DATES CHECKSUMS
+
+  for dep in rapids_cmake rmm kvikio cudf; do
+    repo=${dep//_/-}
+    echo "Fetching $repo..."
+
+    read -r commit date < <(get_commit_info "$repo" "$ARG") || true
+    [[ -n ${commit:-} ]] || die "no commit found for rapidsai/$repo at $ARG"
+    echo " Commit: ${commit:0:7} from $date"
+    echo " Computing SHA256..."
+    checksum=$(get_sha256 "$repo" "$commit")
+    echo " SHA256: $checksum"
+
+    COMMITS[$dep]=$commit
+    DATES[$dep]=$date
+    CHECKSUMS[$dep]=$checksum
+    echo
+  done
+
+  apply_all_updates
+else
+  usage
+  exit 1
+fi
diff --git a/velox/experimental/cudf/exec/CudfLocalPartition.cpp b/velox/experimental/cudf/exec/CudfLocalPartition.cpp
index bce2ee2a120..ab701fc38bf 100644
--- a/velox/experimental/cudf/exec/CudfLocalPartition.cpp
+++ b/velox/experimental/cudf/exec/CudfLocalPartition.cpp
@@ -169,11 +169,16 @@ void CudfLocalPartition::addInput(RowVectorPtr input) {
     VELOX_FAIL("Unsupported partition function");
   }();
 
-  VELOX_CHECK(partitionOffsets.size() == numPartitions_);
+  // cuDF partitioning APIs return num_partitions + 1 offsets where:
+  // - offsets[i] is the starting row index for partition i
+  // - offsets[num_partitions] is the total row count
+  VELOX_CHECK(partitionOffsets.size() == numPartitions_ + 1);
   VELOX_CHECK(partitionOffsets[0] == 0);
 
-  // Erase first element since it's always 0 and we don't need it.
+  // cudf::split expects split points (excluding first 0 and last totalRows).
+  // Erase first element (always 0) and last element (total row count).
   partitionOffsets.erase(partitionOffsets.begin());
+  partitionOffsets.pop_back();
 
   auto partitionedTables =
       cudf::split(partitionedTable->view(), partitionOffsets);