Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cpp/include/nvtext/wordpiece_tokenize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ struct wordpiece_vocabulary {
* @param input Strings for the vocabulary
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Object to be used with nvtext::tokenize_with_vocabulary
* @return Object to be used with nvtext::wordpiece_tokenize
*/
std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
cudf::strings_column_view const& input,
Expand Down Expand Up @@ -103,6 +103,8 @@ std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
*
* Any null row entry results in a corresponding null entry in the output.
*
* @throw std::invalid_argument If `max_words_per_row` is less than 0.
*
* @param input Strings column to tokenize
* @param vocabulary Used to lookup tokens within `input`
* @param max_words_per_row Maximum number of words to tokenize for each row.
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/text/wordpiece_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,6 @@ __device__ cudf::size_type wp_tokenize_fn(cudf::string_view word,
template <typename MapRefType, typename SubMapRefType>
CUDF_KERNEL void tokenize_all_kernel(cudf::device_span<int64_t const> d_edges,
char const* d_chars,
// int64_t offset,
MapRefType const d_map,
SubMapRefType const d_sub_map,
cudf::size_type unk_id,
Expand All @@ -447,7 +446,7 @@ CUDF_KERNEL void tokenize_all_kernel(cudf::device_span<int64_t const> d_edges,
auto const word_end = thrust::find(thrust::seq, begin, end, ' ');
auto const size = static_cast<cudf::size_type>(cuda::std::distance(begin, word_end));
if (size == 0) { return; }
auto d_output = d_tokens + d_edges[idx]; // - offset;
auto d_output = d_tokens + d_edges[idx];
if (size >= max_word_size) {
*d_output = unk_id;
return;
Expand Down Expand Up @@ -833,7 +832,8 @@ std::unique_ptr<cudf::column> wordpiece_tokenize(cudf::strings_column_view const
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(max_words_per_row >= 0, "Invalid value for max_words_per_row argument");
CUDF_EXPECTS(
max_words_per_row >= 0, "Invalid value for max_words_per_row argument", std::invalid_argument);

auto const output_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
if (input.size() == input.null_count()) {
Expand Down
6 changes: 6 additions & 0 deletions cpp/tests/text/subword_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,10 @@ TEST(TextSubwordTest, WordPieceErrors)
auto nulls = cudf::test::strings_column_wrapper({"", "", ""}, {false, false, false});
EXPECT_THROW(nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(nulls)),
std::invalid_argument);

auto vocabulary = cudf::test::strings_column_wrapper({"x"});
auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary));
auto input = cudf::test::strings_column_wrapper({" "});
EXPECT_THROW(nvtext::wordpiece_tokenize(cudf::strings_column_view(input), *vocab, -1),
std::invalid_argument);
}