From 58477661e3ddc50db44591f0d352729a56a8cf9e Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 14 Dec 2021 12:11:31 -0800 Subject: [PATCH 1/3] JNI: Function to copy and set validity from bool column. --- .../main/java/ai/rapids/cudf/ColumnView.java | 19 ++++++++ java/src/main/native/CMakeLists.txt | 1 + java/src/main/native/src/ColumnViewJni.cpp | 27 +++++++++-- java/src/main/native/src/ColumnViewJni.cu | 45 +++++++++++++++++++ java/src/main/native/src/ColumnViewJni.hpp | 32 +++++++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 40 +++++++++++++++++ 6 files changed, 160 insertions(+), 4 deletions(-) create mode 100644 java/src/main/native/src/ColumnViewJni.cu create mode 100644 java/src/main/native/src/ColumnViewJni.hpp diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 6d0d24baf99..5da5e222653 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -803,6 +803,16 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co return new ColumnVector(bitwiseMergeAndSetValidity(getNativeView(), columnViews, mergeOp.nativeId)); } + /** + * Creates a deep copy of a column while replacing the null mask. The null mask is the + * device_vector equivalent of the boolean column given as argument. + * @param validity bool column whose value is to be used as the null mask. + * @return Deep copy of the column with replaced null mask. 
+ */ + public final ColumnVector copyWithValidity(ColumnView validity) { + return new ColumnVector(copyWithValidity(getNativeView(), validity.getNativeView())); + } + ///////////////////////////////////////////////////////////////////////////// // DATE/TIME ///////////////////////////////////////////////////////////////////////////// @@ -3752,6 +3762,15 @@ private static native long clamper(long nativeView, long loScalarHandle, long lo private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] viewHandles, int nullConfig) throws CudfException; + /** + * Native method to deep copy a column while replacing the null mask. The null mask is the + * device_vector equivalent of the boolean column given as argument. + * @param viewHandle column view of the column that is deep copied. + * @param validityHandle bool column whose value is to be used as the null mask. + * @return Deep copy of the column with replaced null mask. + */ + private static native long copyWithValidity(long viewHandle, long validityHandle) throws CudfException; + /** * Get the number of bytes needed to allocate a validity buffer for the given number of rows. */ diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 0ed2f31bfac..2db37d57cbb 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -225,6 +225,7 @@ add_library( src/CudaJni.cpp src/ColumnVectorJni.cpp src/ColumnViewJni.cpp + src/ColumnViewJni.cu src/CompiledExpression.cpp src/ContiguousTableJni.cpp src/HashJoinJni.cpp diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 02d5dc4569c..7501bf441e6 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -14,8 +14,11 @@ * limitations under the License. 
*/ +#include "ColumnViewJni.hpp" #include +#include + #include #include #include @@ -66,14 +69,11 @@ #include #include #include -#include - -#include "cudf/types.hpp" #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" -#include "jni.h" #include "jni_utils.hpp" +#include "map_lookup.hpp" namespace { @@ -1576,6 +1576,25 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithValidity( + JNIEnv *env, jobject j_object, jlong exemplar_handle, jlong validity_column_handle) { + JNI_NULL_CHECK(env, exemplar_handle, "ColumnView handle is null", 0); + JNI_NULL_CHECK(env, validity_column_handle, "Validity column handle is null", 0); + try { + cudf::jni::auto_set_device(env); + + auto const exemplar = *reinterpret_cast(exemplar_handle); + auto const validity = *reinterpret_cast(validity_column_handle); + if (exemplar.size() != validity.size()) { + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, + "Exemplar and validity columns must have the same size", 0); + } + auto deep_copy = cudf::jni::new_column_with_validity(exemplar, validity); + return reinterpret_cast(deep_copy.release()); + } + CATCH_STD(env, 0); +} + //////// // Native cudf::column_view life cycle and metadata access methods. Life cycle methods // should typically only be called from the CudfColumn inner class. diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu new file mode 100644 index 00000000000..140408872d7 --- /dev/null +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "ColumnViewJni.hpp" + +namespace cudf::jni { + +std::unique_ptr new_column_with_validity(cudf::column_view const &exemplar, + cudf::column_view const &validity_column) { + CUDF_EXPECTS(validity_column.type().id() == type_id::BOOL8, + "Validity column must be of type bool"); + CUDF_EXPECTS(validity_column.size() == exemplar.size(), + "Validity column must be of same size as exemplar column"); + + auto validity_device_view = cudf::column_device_view::create(validity_column); + auto validity_begin = cudf::detail::make_optional_iterator( + *validity_device_view, cudf::nullate::DYNAMIC{validity_column.has_nulls()}); + auto validity_end = validity_begin + validity_device_view->size(); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity_begin, validity_end, [] __device__(auto optional_bool) { + return optional_bool.value_or(false); + }); + auto deep_copy = std::make_unique(exemplar); + deep_copy->set_null_mask(std::move(null_mask), null_count); + return deep_copy; +} + +} // namespace cudf::jni diff --git a/java/src/main/native/src/ColumnViewJni.hpp b/java/src/main/native/src/ColumnViewJni.hpp new file mode 100644 index 00000000000..6990f891584 --- /dev/null +++ b/java/src/main/native/src/ColumnViewJni.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace cudf::jni { + +/** + * @brief Creates a deep copy of the exemplar column, with its validity set to the equivalent + * of the boolean `validity` column's value. + * + * @param exemplar The column to be deep copied. + * @param validity_column bool column whose value is to be used as the validity. + * @return Deep copy of the exemplar, with the replaced validity. + */ +std::unique_ptr new_column_with_validity(cudf::column_view const &exemplar, + cudf::column_view const &validity_column); + +} // namespace cudf::jni diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 7120a40a26a..0c0e3aabfe7 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -5886,4 +5886,44 @@ void testReplaceSameIndexColumnInStruct() { }); assertTrue(e.getMessage().contains("Duplicate mapping found for replacing child index")); } + + @Test + void testCopyWithValidity() { + final Boolean T = true; + final Boolean F = false; + final Integer X = null; + + // Straight-line: Invalidate every other row. 
+ try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T, F, T, F, T, F, T); + ColumnVector expected = ColumnVector.fromBoxedInts(X, 2, X, 4, X, 6, X, 8, X, 10); + ColumnVector result = exemplar.copyWithValidity(validity)) { + assertColumnsAreEqual(expected, result); + } + + // Straight-line: Invalidate all Rows. + try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + ColumnVector validity = ColumnVector.fromBoxedBooleans(F, F, F, F, F, F, F, F, F, F); + ColumnVector expected = ColumnVector.fromBoxedInts(X, X, X, X, X, X, X, X, X, X); + ColumnVector result = exemplar.copyWithValidity(validity)) { + assertColumnsAreEqual(expected, result); + } + + // Nulls in the validity column are treated as invalid. + try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T, F, T, F, null, F, null); + ColumnVector expected = ColumnVector.fromBoxedInts(X, 2, X, 4, X, 6, X, X, X, X); + ColumnVector result = exemplar.copyWithValidity(validity)) { + assertColumnsAreEqual(expected, result); + } + + // Negative case: Mismatch in row count. + Exception x = assertThrows(IllegalArgumentException.class, () -> { + try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T); + ColumnVector result = exemplar.copyWithValidity(validity)) { + } + }); + assertTrue(x.getMessage().contains("Exemplar and validity columns must have the same size")); + } } From 2137266bab1374eca91e985027972d0fc5458f21 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 15 Dec 2021 12:14:31 -0800 Subject: [PATCH 2/3] Review changes: 1. Renamed to `copyWithBooleanColumnAsValidity()`. 2. Better documentation on null handling in validity column, failure conditions, etc. 3. 
Removed redundant error checks. --- .../main/java/ai/rapids/cudf/ColumnView.java | 35 ++++++++++++++----- java/src/main/native/src/ColumnViewJni.cpp | 8 ++--- java/src/main/native/src/ColumnViewJni.cu | 7 ++-- java/src/main/native/src/ColumnViewJni.hpp | 12 +++++-- .../java/ai/rapids/cudf/ColumnVectorTest.java | 12 +++---- 5 files changed, 48 insertions(+), 26 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 5da5e222653..5153c5c1d2a 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -804,13 +804,22 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co } /** - * Creates a deep copy of a column while replacing the null mask. The null mask is the + * Creates a deep copy of a column while replacing the validity mask. The validity mask is the * device_vector equivalent of the boolean column given as argument. - * @param validity bool column whose value is to be used as the null mask. - * @return Deep copy of the column with replaced null mask. + * + * The boolColumn must have the same number of rows as the current column. + * The result column will have the same number of rows as the current column. + * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i. + * For all other values (i.e. `false` or `null`), the result column will have nulls. + * + * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`, + * then the row value is undefined. + * + * @param boolColumn bool column whose value is to be used as the validity mask. + * @return Deep copy of the column with replaced validity mask. 
*/ - public final ColumnVector copyWithValidity(ColumnView validity) { - return new ColumnVector(copyWithValidity(getNativeView(), validity.getNativeView())); + public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) { + return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView())); } ///////////////////////////////////////////////////////////////////////////// @@ -3765,11 +3774,21 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi /** * Native method to deep copy a column while replacing the null mask. The null mask is the * device_vector equivalent of the boolean column given as argument. - * @param viewHandle column view of the column that is deep copied. - * @param validityHandle bool column whose value is to be used as the null mask. + * + * The boolColumn must have the same number of rows as the exemplar column. + * The result column will have the same number of rows as the exemplar. + * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i. + * For all other values (i.e. `false` or `null`), the result column will have nulls. + * + * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`, + * then the resultant row value is undefined. + * + * @param exemplarViewHandle column view of the column that is deep copied. + * @param boolColumnViewHandle bool column whose value is to be used as the null mask. * @return Deep copy of the column with replaced null mask. */ - private static native long copyWithValidity(long viewHandle, long validityHandle) throws CudfException; + private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, + long boolColumnViewHandle) throws CudfException; /** * Get the number of bytes needed to allocate a validity buffer for the given number of rows. 
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 7501bf441e6..4cd4b070aed 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1576,7 +1576,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithValidity( +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithBooleanColumnAsValidity( JNIEnv *env, jobject j_object, jlong exemplar_handle, jlong validity_column_handle) { JNI_NULL_CHECK(env, exemplar_handle, "ColumnView handle is null", 0); JNI_NULL_CHECK(env, validity_column_handle, "Validity column handle is null", 0); @@ -1585,11 +1585,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithValidity( auto const exemplar = *reinterpret_cast(exemplar_handle); auto const validity = *reinterpret_cast(validity_column_handle); - if (exemplar.size() != validity.size()) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, - "Exemplar and validity columns must have the same size", 0); - } - auto deep_copy = cudf::jni::new_column_with_validity(exemplar, validity); + auto deep_copy = cudf::jni::new_column_with_boolean_column_as_validity(exemplar, validity); return reinterpret_cast(deep_copy.release()); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 140408872d7..68b6984eb5d 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -22,12 +22,13 @@ namespace cudf::jni { -std::unique_ptr new_column_with_validity(cudf::column_view const &exemplar, - cudf::column_view const &validity_column) { +std::unique_ptr +new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, + cudf::column_view const &validity_column) { CUDF_EXPECTS(validity_column.type().id() == type_id::BOOL8, "Validity column 
must be of type bool"); CUDF_EXPECTS(validity_column.size() == exemplar.size(), - "Validity column must be of same size as exemplar column"); + "Exemplar and validity columns must have the same size"); auto validity_device_view = cudf::column_device_view::create(validity_column); auto validity_begin = cudf::detail::make_optional_iterator( diff --git a/java/src/main/native/src/ColumnViewJni.hpp b/java/src/main/native/src/ColumnViewJni.hpp index 6990f891584..37e58ecb63a 100644 --- a/java/src/main/native/src/ColumnViewJni.hpp +++ b/java/src/main/native/src/ColumnViewJni.hpp @@ -22,11 +22,17 @@ namespace cudf::jni { * @brief Creates a deep copy of the exemplar column, with its validity set to the equivalent * of the boolean `validity` column's value. * + * The bool_column must have the same number of rows as the exemplar column. + * The result column will have the same number of rows as the exemplar. + * For all indices `i` where the boolean column is `true`, the result column will have a valid value + * at index i. For all other values (i.e. `false` or `null`), the result column will have nulls. + * * @param exemplar The column to be deep copied. - * @param validity_column bool column whose value is to be used as the validity. + * @param bool_column bool column whose value is to be used as the validity. * @return Deep copy of the exemplar, with the replaced validity. 
*/ -std::unique_ptr new_column_with_validity(cudf::column_view const &exemplar, - cudf::column_view const &validity_column); +std::unique_ptr +new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, + cudf::column_view const &bool_column); } // namespace cudf::jni diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0c0e3aabfe7..b78183692a3 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -5888,7 +5888,7 @@ void testReplaceSameIndexColumnInStruct() { } @Test - void testCopyWithValidity() { + void testCopyWithBooleanColumnAsValidity() { final Boolean T = true; final Boolean F = false; final Integer X = null; @@ -5897,7 +5897,7 @@ void testCopyWithValidity() { try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T, F, T, F, T, F, T); ColumnVector expected = ColumnVector.fromBoxedInts(X, 2, X, 4, X, 6, X, 8, X, 10); - ColumnVector result = exemplar.copyWithValidity(validity)) { + ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) { assertColumnsAreEqual(expected, result); } @@ -5905,7 +5905,7 @@ void testCopyWithValidity() { try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); ColumnVector validity = ColumnVector.fromBoxedBooleans(F, F, F, F, F, F, F, F, F, F); ColumnVector expected = ColumnVector.fromBoxedInts(X, X, X, X, X, X, X, X, X, X); - ColumnVector result = exemplar.copyWithValidity(validity)) { + ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) { assertColumnsAreEqual(expected, result); } @@ -5913,15 +5913,15 @@ void testCopyWithValidity() { try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T, F, 
T, F, null, F, null); ColumnVector expected = ColumnVector.fromBoxedInts(X, 2, X, 4, X, 6, X, X, X, X); - ColumnVector result = exemplar.copyWithValidity(validity)) { + ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) { assertColumnsAreEqual(expected, result); } // Negative case: Mismatch in row count. - Exception x = assertThrows(IllegalArgumentException.class, () -> { + Exception x = assertThrows(CudfException.class, () -> { try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T); - ColumnVector result = exemplar.copyWithValidity(validity)) { + ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) { } }); assertTrue(x.getMessage().contains("Exemplar and validity columns must have the same size")); From 020f8789a6fe76b75c30254a15afaeb292533cce Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 15 Dec 2021 15:40:37 -0800 Subject: [PATCH 3/3] Review: Discard old null mask before deep copy. 
--- java/src/main/native/src/ColumnViewJni.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 68b6984eb5d..47055ca1611 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -38,7 +38,15 @@ new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, cudf::detail::valid_if(validity_begin, validity_end, [] __device__(auto optional_bool) { return optional_bool.value_or(false); }); - auto deep_copy = std::make_unique<cudf::column>(exemplar); + auto const exemplar_without_null_mask = cudf::column_view{ + exemplar.type(), + exemplar.size(), + exemplar.head(), + nullptr, + 0, + exemplar.offset(), + std::vector<cudf::column_view>{exemplar.child_begin(), exemplar.child_end()}}; + auto deep_copy = std::make_unique<cudf::column>(exemplar_without_null_mask); deep_copy->set_null_mask(std::move(null_mask), null_count); return deep_copy; }