Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 62 additions & 7 deletions CMake/resolve_dependency_modules/cudf.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,50 @@ include_guard(GLOBAL)
# 3.30.4 is the minimum version required by cudf
cmake_minimum_required(VERSION 3.30.4)

# Add velox_resolve_dependency_url here for rapids-cmake, rmm, and kvikio if a specific version or commit is needed.
# Pin rapids-cmake to an exact commit archive and its SHA256 so the build is
# reproducible.
# NOTE: scripts/update-cudf-deps.sh rewrites the "commit <sha> from <date>"
# comment and the VERSION / COMMIT / CHECKSUM values below by text pattern
# matching. Keep the comment wording, the one-line set() calls, and the
# checksum on its own line, or the script will stop matching them.
# rapids_cmake commit 5ec2245 from 2026-01-26
set(VELOX_rapids_cmake_VERSION 26.04)
set(VELOX_rapids_cmake_COMMIT 5ec22457e58953e0a68f0745ce7a11a896ba62b1)
set(
VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM
bf7d4ed5885f5fe012c42fb0977e1fe1416896479ffd34baa0cf762d3e83dc80
)
set(
VELOX_rapids_cmake_SOURCE_URL
"https://github.com/rapidsai/rapids-cmake/archive/${VELOX_rapids_cmake_COMMIT}.tar.gz"
)
# NOTE(review): presumably finalizes the URL/checksum variables (e.g. allowing
# overrides) -- confirm against the definition of velox_resolve_dependency_url.
velox_resolve_dependency_url(rapids_cmake)

set(VELOX_cudf_VERSION 26.02 CACHE STRING "cudf version")
# Pin rmm to an exact commit archive and its SHA256.
# NOTE: scripts/update-cudf-deps.sh rewrites the "commit <sha> from <date>"
# comment and the VERSION / COMMIT / CHECKSUM values below by text pattern
# matching; keep their layout (one-line set() calls, checksum on its own line).
# rmm commit e728b29 from 2026-01-26
set(VELOX_rmm_VERSION 26.04)
set(VELOX_rmm_COMMIT e728b2923f748d71aad30294b6926f43cb4c826e)
set(
VELOX_rmm_BUILD_SHA256_CHECKSUM
ec18d881b327514de154af67a33a1288eec7bcd86909f23c9bf2d90511b0cf2f
)
set(VELOX_rmm_SOURCE_URL "https://github.com/rapidsai/rmm/archive/${VELOX_rmm_COMMIT}.tar.gz")
velox_resolve_dependency_url(rmm)

# kvikio commit 0f03349 from 2026-01-26
set(VELOX_kvikio_VERSION 26.04)
set(VELOX_kvikio_COMMIT 0f03349bcaf029a2f582d9915a88d09e355ac691)
set(
VELOX_cudf_BUILD_SHA256_CHECKSUM
96b54c2b33281f58183978429933740869ef384d2687308699a257b05076d4fd
VELOX_kvikio_BUILD_SHA256_CHECKSUM
728868c671e2686b5e9b7b4122d1661475f803c4fb98c0852d7be65c365d7b2d
)
set(
VELOX_cudf_SOURCE_URL
"https://github.com/rapidsai/cudf/archive/3f85f626633ca4202941cc3bf3112bcd319eab8e.tar.gz"
VELOX_kvikio_SOURCE_URL
"https://github.com/rapidsai/kvikio/archive/${VELOX_kvikio_COMMIT}.tar.gz"
)
velox_resolve_dependency_url(kvikio)

# Pin cudf to an exact commit archive and its SHA256.
# NOTE: scripts/update-cudf-deps.sh rewrites the "commit <sha> from <date>"
# comment and the VERSION / COMMIT / CHECKSUM values below by text pattern
# matching; keep their layout. Unlike the other dependencies, the cudf version
# is a CACHE variable and the script matches the "CACHE" keyword on that line.
# cudf commit 68a0714 from 2026-01-27
set(VELOX_cudf_VERSION 26.04 CACHE STRING "cudf version")
set(VELOX_cudf_COMMIT 68a0714a3701431041cb47bf1163706f597f9f48)
set(
VELOX_cudf_BUILD_SHA256_CHECKSUM
0c723d7fd04eab60336dd4bcce41e225821d13b54cdabe485ec54517f3aa8b15
)
set(VELOX_cudf_SOURCE_URL "https://github.com/rapidsai/cudf/archive/${VELOX_cudf_COMMIT}.tar.gz")
velox_resolve_dependency_url(cudf)

# Use block so we don't leak variables
Expand All @@ -38,7 +70,30 @@ block(SCOPE_FOR VARIABLES)
set(CUDF_BUILD_TESTUTIL OFF)
set(BUILD_SHARED_LIBS ON)

# Add FetchContent_Declare here for rapids-cmake, rmm, and kvikio if a specific version or commit is needed.
# Declare the pinned RAPIDS dependencies of cudf. Each one is downloaded from
# the URL resolved for it and verified against the matching checksum variable.
# NOTE(review): URL_HASH expects "<ALGO>=<hash>"; presumably
# velox_resolve_dependency_url adds the "SHA256=" prefix -- confirm.
# UPDATE_DISCONNECTED 1 skips the update step on later configures.
FetchContent_Declare(
rapids-cmake
URL ${VELOX_rapids_cmake_SOURCE_URL}
URL_HASH ${VELOX_rapids_cmake_BUILD_SHA256_CHECKSUM}
UPDATE_DISCONNECTED 1
)

# rmm: SOURCE_SUBDIR points CMake at the "cpp" directory of the archive
# instead of the archive root.
FetchContent_Declare(
rmm
URL ${VELOX_rmm_SOURCE_URL}
URL_HASH ${VELOX_rmm_BUILD_SHA256_CHECKSUM}
SOURCE_SUBDIR
cpp
UPDATE_DISCONNECTED 1
)

# kvikio: likewise configured from the "cpp" subdirectory of the archive.
FetchContent_Declare(
kvikio
URL ${VELOX_kvikio_SOURCE_URL}
URL_HASH ${VELOX_kvikio_BUILD_SHA256_CHECKSUM}
SOURCE_SUBDIR
cpp
UPDATE_DISCONNECTED 1
)

FetchContent_Declare(
cudf
Expand Down
192 changes: 192 additions & 0 deletions scripts/update-cudf-deps.sh
Copy link
Copy Markdown
Collaborator Author

@bdice bdice Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is optional -- if Velox maintainers have reservations about including this script, we don't have to include it here. We can move it somewhere else.

Its purpose is to pin a compatible cudf dependency tree (rapids-cmake, rmm, kvikio) based on the desired cuDF branch/PR/commit hash.

Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -euo pipefail

# Print the command-line help text to stdout.
# The three invocation forms are mutually exclusive: exactly one of
# --branch, --pr or --commit is given, followed by its value.
# "$0" is expanded inside the heredoc so the help shows the script's own path.
usage() {
cat <<EOF
Usage: $0 --branch <branch>
$0 --pr <pr-number>
$0 --commit <sha>

Options:
--branch <branch> Update all cudf dependencies to latest from branch
--pr <pr-number> Update only cudf from a specific PR
--commit <sha> Update all dependencies using cudf commit and compatible versions

Examples:
$0 --branch main
$0 --branch release/26.02
$0 --pr 12345
$0 --commit abc123def456
EOF
}

# --- Argument parsing -------------------------------------------------------
# The script takes exactly one mode flag followed by its value. Diagnostics go
# to stderr so that stdout only ever carries progress output.
if [[ $# -eq 0 ]]; then
  usage >&2
  exit 1
fi

MODE="$1"
ARG="${2:-}"

# Asking for help is not an error; previously this fell through to
# "Error: --help requires an argument".
if [[ $MODE == "-h" || $MODE == "--help" ]]; then
  usage
  exit 0
fi

if [[ -z $ARG ]]; then
  echo "Error: $MODE requires an argument" >&2
  usage >&2
  exit 1
fi

# Fail fast, before any network traffic, if a required tool is missing.
for required_tool in curl jq sha256sum; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "Error: required tool '$required_tool' not found in PATH" >&2
    exit 1
  fi
done

# Locate the CMake module relative to this script so the script can be run
# from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CMAKE_FILE="$SCRIPT_DIR/../CMake/resolve_dependency_modules/cudf.cmake"

if [[ ! -f $CMAKE_FILE ]]; then
  echo "Error: cannot find $CMAKE_FILE" >&2
  exit 1
fi

# get_commit_info <repo> <ref>
# Ask the GitHub API for the commit that <ref> resolves to in rapidsai/<repo>
# and print one line: the commit SHA, a space, and the first ten characters of
# the committer date. Fails (via curl -f and pipefail) on an HTTP error.
get_commit_info() {
  local project=$1 ref=$2
  local endpoint="https://api.github.com/repos/rapidsai/${project}/commits/${ref}"
  local fields='[.sha, .commit.committer.date[:10]] | join(" ")'

  curl -sf "$endpoint" | jq -r "$fields"
}

# get_commit_before_date <repo> <timestamp>
# Ask the GitHub API for the newest commit on the main branch of
# rapidsai/<repo> that is not later than <timestamp>, and print one line: the
# commit SHA, a space, and the first ten characters of the committer date.
get_commit_before_date() {
  local project=$1 cutoff=$2
  local query="sha=main&until=${cutoff}&per_page=1"
  local endpoint="https://api.github.com/repos/rapidsai/${project}/commits?${query}"

  # per_page=1 means the response array holds at most one commit; take it.
  curl -sf "$endpoint" |
    jq -r '.[0] | [.sha, .commit.committer.date[:10]] | join(" ")'
}

# get_sha256 <repo> <commit>
# Download the source archive of rapidsai/<repo> at <commit> and print its
# SHA256 digest (hex only, no file name).
#
# Returns non-zero when an argument is empty or the download fails. The
# download failure is surfaced by curl's -f flag together with the script's
# "set -o pipefail"; without -f curl would exit 0 on a 404 and the digest of
# the HTML error page would be recorded as if it were the archive's checksum.
get_sha256() {
  local repo=${1:-} commit=${2:-}

  if [[ -z $repo || -z $commit ]]; then
    echo "Error: get_sha256 needs a repository and a commit" >&2
    return 1
  fi

  curl -sfL "https://github.com/rapidsai/${repo}/archive/${commit}.tar.gz" |
    sha256sum |
    cut -d' ' -f1
}

# get_version <ref>
# Print the "<major>.<minor>" version that belongs to a cuDF ref.
#   - For a ref named "release/<major>.<minor>" the version is taken from the
#     name itself; no network access is needed.
#   - For any other ref (branch name or commit SHA) the VERSION file of
#     rapidsai/cudf at that ref is downloaded and its leading
#     "<major>.<minor>" is extracted.
# Returns non-zero if the download fails or the file does not start with a
# version number.
get_version() {
  local branch=$1
  if [[ $branch =~ ^release/([0-9]+\.[0-9]+)$ ]]; then
    echo "${BASH_REMATCH[1]}"
  else
    # The pattern is plain ERE, so use -E rather than -P: PCRE support is a
    # GNU grep build option and is missing from BSD/busybox grep.
    curl -sf "https://raw.githubusercontent.com/rapidsai/cudf/${branch}/VERSION" |
      grep -oE '^[0-9]+\.[0-9]+'
  fi
}

# update_dependency <name> <commit> <date> <checksum>
# Rewrite the pin of one dependency inside $CMAKE_FILE:
#   - the "# <name> commit <short-sha> from <date>" comment,
#   - set(VELOX_<name>_COMMIT ...),
#   - set(VELOX_<name>_VERSION ...), using the global $VERSION
#     (for cudf the version line carries a trailing "CACHE ..." clause),
#   - the 64-hex-digit value that follows VELOX_<name>_BUILD_SHA256_CHECKSUM.
#
# Arguments:
#   name      dependency name as used in the variable names (e.g. rapids_cmake)
#   commit    full 40-character commit SHA
#   date      commit date, YYYY-MM-DD
#   checksum  SHA256 of the source archive, 64 hex digits
# Globals:    CMAKE_FILE (read, file is modified in place), VERSION (read)
# Returns:    0 on success; 1 (with a message on stderr) if an input is
#             malformed or if the file does not contain the expected lines.
update_dependency() {
  local var=$1 commit=$2 date=$3 checksum=$4
  local version=${VERSION:-}
  local tmp="${CMAKE_FILE}.tmp"

  # Reject empty strings and jq's literal "null" before touching the file; a
  # failed lookup must never be written out as a pin.
  if ! [[ $commit =~ ^[a-f0-9]{40}$ ]]; then
    echo "Error: ${var}: '${commit}' is not a full commit SHA" >&2
    return 1
  fi
  if ! [[ $checksum =~ ^[a-f0-9]{64}$ ]]; then
    echo "Error: ${var}: '${checksum}' is not a SHA256 digest" >&2
    return 1
  fi
  if ! [[ $date =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
    echo "Error: ${var}: '${date}' is not a YYYY-MM-DD date" >&2
    return 1
  fi
  if ! [[ $version =~ ^[0-9]+\.[0-9]+$ ]]; then
    echo "Error: ${var}: VERSION '${version}' is not of the form <major>.<minor>" >&2
    return 1
  fi

  # cudf's version is a CACHE variable, so its set() does not end right after
  # the number the way the other dependencies' do.
  local version_tail=")"
  if [[ $var == "cudf" ]]; then
    version_tail=" CACHE"
  fi

  sed -i \
    -e "s/# ${var} commit [a-f0-9]* from [0-9-]*/# ${var} commit ${commit:0:7} from ${date}/" \
    -e "s/set(VELOX_${var}_COMMIT [a-f0-9]*)/set(VELOX_${var}_COMMIT ${commit})/" \
    -e "s/set(VELOX_${var}_VERSION [0-9.]*${version_tail}/set(VELOX_${var}_VERSION ${version}${version_tail}/" \
    "$CMAKE_FILE"

  # Replace the first stand-alone 64-hex-digit line after the checksum
  # variable's name. The search is armed only where the variable is being
  # defined (not by "${VAR}" references such as URL_HASH arguments) and is
  # disarmed at the closing parenthesis, so it can never run on into another
  # dependency's checksum. awk exits non-zero when nothing was replaced.
  if awk -v var="VELOX_${var}_BUILD_SHA256_CHECKSUM" -v sum="$checksum" '
    index($0, var) && !index($0, "${" var "}") { armed = 1 }
    armed && NF == 1 && length($1) == 64 && $1 ~ /^[a-f0-9]+$/ {
      sub($1, sum)
      armed = 0
      replaced = 1
    }
    armed && index($0, ")") { armed = 0 }
    { print }
    END { exit !replaced }
  ' "$CMAKE_FILE" >"$tmp"; then
    mv "$tmp" "$CMAKE_FILE"
  else
    rm -f "$tmp"
    echo "Error: no VELOX_${var}_BUILD_SHA256_CHECKSUM value found in $CMAKE_FILE" >&2
    return 1
  fi

  # sed succeeds even when nothing matched, so confirm the new values landed.
  if ! grep -qF "set(VELOX_${var}_COMMIT ${commit})" "$CMAKE_FILE" ||
    ! grep -qF "set(VELOX_${var}_VERSION ${version}${version_tail}" "$CMAKE_FILE"; then
    echo "Error: could not update the commit/version of ${var} in $CMAKE_FILE" >&2
    return 1
  fi
}

# --- Mode dispatch ----------------------------------------------------------
# --pr      pin only cudf to the head commit of a pull request
# --commit  pin cudf to a commit and every other dependency to its newest
#           main-branch commit that is not later than that cudf commit
# --branch  pin every dependency to the tip of the same-named branch
#
# Every GitHub lookup is checked explicitly: a failed request or a missing
# field aborts the script instead of leaving an empty value to be written
# into the CMake file.

# Print an error on stderr and abort the script.
die() {
  echo "Error: $*" >&2
  exit 1
}

# Succeed when $1 looks like a full 40-character commit SHA. This rejects the
# empty string and the literal "null" that jq prints for a missing field.
is_full_sha() {
  [[ $1 =~ ^[a-f0-9]{40}$ ]]
}

# Results of the lookups, keyed by dependency name (rapids_cmake, rmm, ...).
declare -A COMMITS DATES CHECKSUMS

# resolve_dependency <dep> <lookup-function> <ref>
# Call <lookup-function> "<repo>" "<ref>" (it must print "<sha> <date>"),
# compute the archive checksum and record all three values for <dep>.
resolve_dependency() {
  local dep=$1 lookup=$2 ref=$3
  local repo=${dep//_/-} # variable names use "_", GitHub repos use "-"
  local info commit date checksum

  echo "Fetching $repo..."
  # A command substitution exposes the lookup's exit status; the previous
  # "read ... < <(lookup)" form silently discarded it.
  info=$("$lookup" "$repo" "$ref") || die "could not look up a commit for $repo"
  read -r commit date <<<"$info"
  is_full_sha "$commit" || die "unexpected commit '$commit' returned for $repo"
  [[ -n $date ]] || die "no commit date returned for $repo"

  echo " Commit: ${commit:0:7} from $date"
  echo " Computing SHA256..."
  checksum=$(get_sha256 "$repo" "$commit") || die "could not checksum the $repo archive for $commit"
  echo " SHA256: $checksum"

  COMMITS[$dep]=$commit
  DATES[$dep]=$date
  CHECKSUMS[$dep]=$checksum
  echo
}

# Write every recorded dependency into the CMake file, then print a summary.
apply_updates() {
  local dep

  echo "Updating $CMAKE_FILE..."
  for dep in rapids_cmake rmm kvikio cudf; do
    update_dependency "$dep" "${COMMITS[$dep]}" "${DATES[$dep]}" "${CHECKSUMS[$dep]}"
  done

  echo "Done! Updated dependencies:"
  for dep in rapids_cmake rmm kvikio cudf; do
    echo " $dep: ${COMMITS[$dep]:0:7} (${DATES[$dep]})"
  done
}

if [[ $MODE == "--pr" ]]; then
  echo "Fetching cuDF PR #${ARG}..."
  PR_INFO=$(curl -sf "https://api.github.com/repos/rapidsai/cudf/pulls/${ARG}") ||
    die "could not fetch cuDF PR #${ARG}"
  SHA=$(echo "$PR_INFO" | jq -r '.head.sha')
  BASE=$(echo "$PR_INFO" | jq -r '.base.ref')
  is_full_sha "$SHA" || die "cuDF PR #${ARG} has no usable head commit (got '$SHA')"
  VERSION=$(get_version "$BASE") || die "could not determine the cuDF version of $BASE"
  DATE=$(curl -sf "https://api.github.com/repos/rapidsai/cudf/commits/${SHA}" | jq -r '.commit.committer.date[:10]') ||
    die "could not fetch cuDF commit $SHA"

  echo " Base: $BASE (version $VERSION)"
  echo " Commit: ${SHA:0:7} from $DATE"
  echo " Computing SHA256..."
  CHECKSUM=$(get_sha256 "cudf" "$SHA") || die "could not checksum the cudf archive for $SHA"
  echo " SHA256: $CHECKSUM"
  echo

  update_dependency "cudf" "$SHA" "$DATE" "$CHECKSUM"
  echo "Done! Updated cudf to PR #${ARG}: ${SHA:0:7} ($DATE)"

elif [[ $MODE == "--commit" ]]; then
  echo "Fetching cuDF commit ${ARG:0:7}..."
  COMMIT_INFO=$(curl -sf "https://api.github.com/repos/rapidsai/cudf/commits/${ARG}") ||
    die "could not fetch cuDF commit ${ARG}"
  SHA=$(echo "$COMMIT_INFO" | jq -r '.sha')
  DATE=$(echo "$COMMIT_INFO" | jq -r '.commit.committer.date[:10]')
  TIMESTAMP=$(echo "$COMMIT_INFO" | jq -r '.commit.committer.date')
  is_full_sha "$SHA" || die "cuDF commit ${ARG} could not be resolved (got '$SHA')"
  # A SHA is not a release/* name, so get_version reads the VERSION file at
  # that commit -- the same lookup this branch used to spell out inline.
  VERSION=$(get_version "$SHA") || die "could not determine the cuDF version at $SHA"

  echo " Commit: ${SHA:0:7} from $DATE"
  echo " Version: $VERSION"
  echo

  COMMITS[cudf]=$SHA
  DATES[cudf]=$DATE

  echo "Finding compatible dependency versions (main branch commits before $TIMESTAMP)..."
  echo

  for dep in rapids_cmake rmm kvikio; do
    resolve_dependency "$dep" get_commit_before_date "$TIMESTAMP"
  done

  echo "Computing SHA256 for cudf..."
  CHECKSUMS[cudf]=$(get_sha256 "cudf" "$SHA") || die "could not checksum the cudf archive for $SHA"
  echo " SHA256: ${CHECKSUMS[cudf]}"
  echo

  apply_updates

elif [[ $MODE == "--branch" ]]; then
  VERSION=$(get_version "$ARG") || die "could not determine the cuDF version of branch $ARG"
  echo "Updating cuDF dependencies from branch $ARG (version $VERSION)"
  echo

  for dep in rapids_cmake rmm kvikio cudf; do
    resolve_dependency "$dep" get_commit_info "$ARG"
  done

  apply_updates
else
  usage
  exit 1
fi
9 changes: 7 additions & 2 deletions velox/experimental/cudf/exec/CudfLocalPartition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,16 @@ void CudfLocalPartition::addInput(RowVectorPtr input) {
VELOX_FAIL("Unsupported partition function");
}();

VELOX_CHECK(partitionOffsets.size() == numPartitions_);
// cuDF partitioning APIs return num_partitions + 1 offsets where:
// - offsets[i] is the starting row index for partition i
// - offsets[num_partitions] is the total row count
VELOX_CHECK(partitionOffsets.size() == numPartitions_ + 1);
VELOX_CHECK(partitionOffsets[0] == 0);

// Erase first element since it's always 0 and we don't need it.
// cudf::split expects split points (excluding first 0 and last totalRows).
// Erase first element (always 0) and last element (total row count).
partitionOffsets.erase(partitionOffsets.begin());
partitionOffsets.pop_back();

auto partitionedTables =
cudf::split(partitionedTable->view(), partitionOffsets);
Expand Down
Loading