diff --git a/cpp/include/nvtext/wordpiece_tokenize.hpp b/cpp/include/nvtext/wordpiece_tokenize.hpp
index 85c0f1406bd..025cae6bf54 100644
--- a/cpp/include/nvtext/wordpiece_tokenize.hpp
+++ b/cpp/include/nvtext/wordpiece_tokenize.hpp
@@ -66,7 +66,7 @@ struct wordpiece_vocabulary {
  * @param input Strings for the vocabulary
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Object to be used with nvtext::tokenize_with_vocabulary
+ * @return Object to be used with nvtext::wordpiece_tokenize
  */
 std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
   cudf::strings_column_view const& input,
@@ -103,6 +103,8 @@ std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
  *
  * Any null row entry results in a corresponding null entry in the output.
  *
+ * @throw std::invalid_argument If `max_words_per_row` is less than 0.
+ *
  * @param input Strings column to tokenize
  * @param vocabulary Used to lookup tokens within `input`
  * @param max_words_per_row Maximum number of words to tokenize for each row.
diff --git a/cpp/src/text/wordpiece_tokenize.cu b/cpp/src/text/wordpiece_tokenize.cu
index c21200e5c5b..fb44d671902 100644
--- a/cpp/src/text/wordpiece_tokenize.cu
+++ b/cpp/src/text/wordpiece_tokenize.cu
@@ -434,7 +434,6 @@ __device__ cudf::size_type wp_tokenize_fn(cudf::string_view word,
 template <typename MapRefType, typename SubMapRefType>
 CUDF_KERNEL void tokenize_all_kernel(cudf::device_span<int64_t const> d_edges,
                                      char const* d_chars,
-                                     // int64_t offset,
                                      MapRefType const d_map,
                                      SubMapRefType const d_sub_map,
                                      cudf::size_type unk_id,
@@ -447,7 +446,7 @@ CUDF_KERNEL void tokenize_all_kernel(cudf::device_span<int64_t const> d_edges,
   auto const word_end = thrust::find(thrust::seq, begin, end, ' ');
   auto const size     = static_cast<cudf::size_type>(cuda::std::distance(begin, word_end));
   if (size == 0) { return; }
-  auto d_output = d_tokens + d_edges[idx];  // - offset;
+  auto d_output = d_tokens + d_edges[idx];
   if (size >= max_word_size) {
     *d_output = unk_id;
     return;
@@ -833,7 +832,8 @@ std::unique_ptr<cudf::column> wordpiece_tokenize(cudf::strings_column_view const
                                                  rmm::cuda_stream_view stream,
                                                  rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(max_words_per_row >= 0, "Invalid value for max_words_per_row argument");
+  CUDF_EXPECTS(
+    max_words_per_row >= 0, "Invalid value for max_words_per_row argument", std::invalid_argument);
   auto const output_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
   if (input.size() == input.null_count()) {
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index b01042abf61..ca02f17672e 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -170,4 +170,10 @@ TEST(TextSubwordTest, WordPieceErrors)
   auto nulls = cudf::test::strings_column_wrapper({"", "", ""}, {false, false, false});
   EXPECT_THROW(nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(nulls)),
                std::invalid_argument);
+
+  auto vocabulary = cudf::test::strings_column_wrapper({"x"});
+  auto vocab      = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary));
+  auto input      = cudf::test::strings_column_wrapper({" "});
+  EXPECT_THROW(nvtext::wordpiece_tokenize(cudf::strings_column_view(input), *vocab, -1),
+               std::invalid_argument);
 }
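
For reference, a minimal caller-side sketch of the behavior this patch documents and tests: a negative `max_words_per_row` now raises `std::invalid_argument` (previously a plain `CUDF_EXPECTS` failure, i.e. `cudf::logic_error`). This is not part of the patch; the helper name `tokenize_example`, the caller-provided `vocab_strings`/`input_strings` columns, and the example limit of 10 are illustrative assumptions.

```cpp
#include <nvtext/wordpiece_tokenize.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>
#include <stdexcept>

// `vocab_strings` and `input_strings` are strings columns built elsewhere by the caller.
std::unique_ptr<cudf::column> tokenize_example(cudf::strings_column_view const& vocab_strings,
                                               cudf::strings_column_view const& input_strings)
{
  auto vocab = nvtext::load_wordpiece_vocabulary(vocab_strings);

  try {
    // With this change, a negative max_words_per_row surfaces as std::invalid_argument.
    auto bad = nvtext::wordpiece_tokenize(input_strings, *vocab, -1);
  } catch (std::invalid_argument const& err) {
    // err.what() contains "Invalid value for max_words_per_row argument"
  }

  // Any non-negative limit is still accepted; only values below 0 are rejected.
  return nvtext::wordpiece_tokenize(input_strings, *vocab, 10);
}
```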