Skip to content

Commit

Permalink
fix: Fix Flatbuffer Parsing in various corner cases
Browse files Browse the repository at this point in the history
* Fix logic issues with detecting presence/absence of feature names and hashes in example data, leading to an access violation crash
* Fix issues with properly parsing ExampleCollection buffers, around reporting doneness and advancing when parsing inner MultiExample objects.
* Fix tests to include coverage for incoming flatbuffers with/without names/hashes
* Fixes validation logic to avoid crashing when feature names/hashes are missing
* Add read_span_flatbuffer tests
  • Loading branch information
lokitoth committed Feb 13, 2024
1 parent 0351a62 commit ca77f15
Show file tree
Hide file tree
Showing 13 changed files with 595 additions and 385 deletions.
4 changes: 3 additions & 1 deletion vowpalwabbit/fb_parser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ set(vw_fb_parser_test_sources
tests/prototype_label.h
tests/prototype_namespace.h

tests/flatbuffer_parser_test.cc
tests/affordance_validation_tests.cc
tests/read_span_tests.cc
tests/flatbuffer_parser_tests.cc
)

message(STATUS "vw_fb_parser_test_sources: ${vw_fb_parser_test_sources}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
#pragma once

#include "vw/core/api_status.h"
#include "vw/core/example.h"
#include "vw/core/multi_ex.h"
#include "vw/core/shared_data.h"
#include "vw/core/vw_fwd.h"
#include "vw/fb_parser/generated/example_generated.h"
#include "vw/core/example.h"

namespace VW
{
Expand All @@ -21,7 +21,8 @@ namespace parsers
namespace flatbuffer
{
int flatbuffer_to_examples(VW::workspace* all, io_buf& buf, VW::multi_ex& examples);
bool read_span_flatbuffer(VW::workspace* all, const uint8_t* span, size_t length, example_factory_t example_factory, VW::multi_ex& examples);
bool read_span_flatbuffer(
VW::workspace* all, const uint8_t* span, size_t length, example_factory_t example_factory, VW::multi_ex& examples);

class parser
{
Expand Down
160 changes: 79 additions & 81 deletions vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ int flatbuffer_to_examples(VW::workspace* all, io_buf& buf, VW::multi_ex& exampl
return static_cast<int>(status.get_error_code() == VW::experimental::error_code::success);
}

bool read_span_flatbuffer(VW::workspace* all, const uint8_t* span, size_t length, example_factory_t example_factory, VW::multi_ex& examples)
bool read_span_flatbuffer(
VW::workspace* all, const uint8_t* span, size_t length, example_factory_t example_factory, VW::multi_ex& examples)
{
// we expect context to contain a size_prefixed flatbuffer (technically a binary string)
// which means:
Expand All @@ -69,8 +70,9 @@ bool read_span_flatbuffer(VW::workspace* all, const uint8_t* span, size_t length
return false;
}

uint32_t flatbuffer_object_size = *reinterpret_cast<const uint32_t*>(span);
if (length != flatbuffer_object_size + sizeof(uint32_t))
flatbuffers::uoffset_t flatbuffer_object_size =
flatbuffers::ReadScalar<flatbuffers::uoffset_t>(span); //*reinterpret_cast<const uint32_t*>(span);
if (length != flatbuffer_object_size + sizeof(flatbuffers::uoffset_t))
{
std::stringstream sstream;
sstream << "fb_parser error: flatbuffer size prefix does not match actual size" << std::endl;
Expand All @@ -85,8 +87,7 @@ bool read_span_flatbuffer(VW::workspace* all, const uint8_t* span, size_t length

bool has_more = true;
VW::experimental::api_status status;
do
{
do {
switch (all->parser_runtime.flat_converter->parse_examples(all, unused, temp_ex, span, &status))
{
case VW::experimental::error_code::success:
Expand All @@ -102,7 +103,9 @@ bool read_span_flatbuffer(VW::workspace* all, const uint8_t* span, size_t length
return false;
}

if (has_more &= !temp_ex[0]->is_newline)
has_more &= !temp_ex[0]->is_newline;

if (!temp_ex[0]->is_newline)
{
examples.push_back(&example_factory());
std::swap(examples[examples.size() - 1], temp_ex[0]);
Expand Down Expand Up @@ -196,11 +199,15 @@ int parser::process_collection_item(VW::workspace* all, VW::multi_ex& examples,
_multi_example_object = _data->example_obj_as_ExampleCollection()->multi_examples()->Get(_example_index);
RETURN_IF_FAIL(parse_multi_example(all, examples[0], _multi_example_object, status));
// read from active collection
_example_index++;
if (_example_index == _data->example_obj_as_ExampleCollection()->multi_examples()->size())

if (!_active_multi_ex)
{
_example_index = 0;
_active_collection = false;
_example_index++;
if (_example_index == _data->example_obj_as_ExampleCollection()->multi_examples()->size())
{
_example_index = 0;
_active_collection = false;
}
}
}
else
Expand All @@ -220,8 +227,19 @@ int parser::process_collection_item(VW::workspace* all, VW::multi_ex& examples,
int parser::parse_examples(VW::workspace* all, io_buf& buf, VW::multi_ex& examples, const uint8_t* buffer_pointer,
VW::experimental::api_status* status)
{
if (_active_multi_ex) { RETURN_IF_FAIL(parse_multi_example(all, examples[0], _multi_example_object, status)); }
else if (_active_collection) { RETURN_IF_FAIL(process_collection_item(all, examples, status)); }
#define RETURN_SUCCESS_FINISHED() \
return buffer_pointer ? VW::experimental::error_code::nothing_to_parse : VW::experimental::error_code::success;

if (_active_collection)
{
RETURN_IF_FAIL(process_collection_item(all, examples, status));
if (!_active_collection) RETURN_SUCCESS_FINISHED();
}
else if (_active_multi_ex)
{
RETURN_IF_FAIL(parse_multi_example(all, examples[0], _multi_example_object, status));
if (!_active_multi_ex) RETURN_SUCCESS_FINISHED();
}
else
{
// new object to be read from file
Expand All @@ -233,26 +251,33 @@ int parser::parse_examples(VW::workspace* all, io_buf& buf, VW::multi_ex& exampl
{
const auto example = _data->example_obj_as_Example();
RETURN_IF_FAIL(parse_example(all, examples[0], example, status));
RETURN_SUCCESS_FINISHED();
}
break;
case VW::parsers::flatbuffer::ExampleType_MultiExample:
{
_multi_example_object = _data->example_obj_as_MultiExample();
_active_multi_ex = true;

RETURN_IF_FAIL(parse_multi_example(all, examples[0], _multi_example_object, status));
if (!_active_multi_ex) RETURN_SUCCESS_FINISHED();
}
break;
case VW::parsers::flatbuffer::ExampleType_ExampleCollection:
{
_active_collection = true;

RETURN_IF_FAIL(process_collection_item(all, examples, status));
if (!_active_collection) RETURN_SUCCESS_FINISHED();
}
break;

default:
RETURN_ERROR_LS(status, fb_parser_unknown_example_type) << "Unknown example type";

Check warning on line 276 in vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc

View check run for this annotation

Codecov / codecov/patch

vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc#L276

Added line #L276 was not covered by tests
break;
}
}

return VW::experimental::error_code::success;
}

Expand Down Expand Up @@ -342,8 +367,36 @@ bool get_namespace_hash(VW::workspace* all, const Namespace* ns, uint64_t& hash)
return false;
}

bool features_have_names(const Namespace& ns)
{
return flatbuffers::IsFieldPresent(&ns, Namespace::VT_FEATURE_NAMES) && (ns.feature_names()->size() != 0);
// TODO: It is not clear what the right answer is when feature_values->size is 0
}

bool features_have_hashes(const Namespace& ns)
{
return flatbuffers::IsFieldPresent(&ns, Namespace::VT_FEATURE_HASHES) && (ns.feature_hashes()->size() != 0);
}

bool features_have_values(const Namespace& ns)
{
return flatbuffers::IsFieldPresent(&ns, Namespace::VT_FEATURE_VALUES) && (ns.feature_values()->size() != 0);
}

int parser::parse_namespaces(VW::workspace* all, example* ae, const Namespace* ns, VW::experimental::api_status* status)
{
#define RETURN_NS_PARSER_ERROR(status, error_code) \
if (_active_collection && _active_multi_ex) \
{ \
RETURN_ERROR_LS(status, error_code) << "Unable to parse namespace in collection item with example index " \
<< _example_index << "and multi example index " << _multi_ex_index; \
} \
else if (_active_multi_ex) \
{ \
RETURN_ERROR_LS(status, error_code) << "Unable to parse namespace in multi example index " << _multi_ex_index; \
} \
else { RETURN_ERROR_LS(status, error_code) << "Unable to parse namespace "; }

namespace_index index;
RETURN_IF_FAIL(parser::get_namespace_index(ns, index, status));
uint64_t hash = 0;
Expand All @@ -355,46 +408,24 @@ int parser::parse_namespaces(VW::workspace* all, example* ae, const Namespace* n

if (hash_found) { fs.start_ns_extent(hash); }

if (!flatbuffers::IsFieldPresent(ns, Namespace::VT_FEATURE_VALUES))
{
if (_active_collection && _active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_feature_values_missing)
<< "Unable to parse namespace in collection item with example index " << _example_index
<< "and multi example index " << _multi_ex_index;
}
else if (_active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_feature_values_missing)
<< "Unable to parse namespace in multi example index " << _multi_ex_index;
}
else { RETURN_ERROR_LS(status, fb_parser_feature_values_missing) << "Unable to parse namespace "; }
}
if (!features_have_values(*ns)) { RETURN_NS_PARSER_ERROR(status, fb_parser_feature_values_missing) }

auto feature_value_iter = (ns->feature_values())->begin();
const auto feature_value_iter_end = (ns->feature_values())->end();

bool has_hashes = features_have_hashes(*ns);
bool has_names = features_have_names(*ns);

// assuming the usecase that if feature names would exist, they would exist for all features in the namespace
if (flatbuffers::IsFieldPresent(ns, Namespace::VT_FEATURE_NAMES))
if (has_names)
{
const auto ns_name = ns->name();
auto feature_name_iter = (ns->feature_names())->begin();
if (flatbuffers::IsFieldPresent(ns, Namespace::VT_FEATURE_HASHES))
if (has_hashes)
{
if (ns->feature_hashes()->size() != ns->feature_values()->size())
{
if (_active_collection && _active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_hashes_ft_values)
<< "Unable to parse namespace in collection item with example index " << _example_index
<< "and multi example index " << _multi_ex_index;
}
else if (_active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_hashes_ft_values)
<< "Unable to parse namespace in multi example index " << _multi_ex_index;
}
else { RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_hashes_ft_values) << "Unable to parse namespace "; }
RETURN_NS_PARSER_ERROR(status, fb_parser_size_mismatch_ft_hashes_ft_values)

Check warning on line 428 in vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc

View check run for this annotation

Codecov / codecov/patch

vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc#L428

Added line #L428 was not covered by tests
}

auto feature_hash_iter = (ns->feature_hashes())->begin();
Expand All @@ -413,23 +444,13 @@ int parser::parse_namespaces(VW::workspace* all, example* ae, const Namespace* n
// assuming the usecase that if feature names would exist, they would exist for all features in the namespace
if (ns->feature_names()->size() != ns->feature_values()->size())
{
if (_active_collection && _active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_names_ft_values)
<< "Unable to parse namespace in collection item with example index " << _example_index
<< "and multi example index " << _multi_ex_index;
}
else if (_active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_names_ft_values)
<< "Unable to parse namespace in multi example index " << _multi_ex_index;
}
else { RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_names_ft_values) << "Unable to parse namespace "; }
RETURN_NS_PARSER_ERROR(status, fb_parser_size_mismatch_ft_names_ft_values)

Check warning on line 447 in vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc

View check run for this annotation

Codecov / codecov/patch

vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc#L447

Added line #L447 was not covered by tests
}
for (; feature_value_iter != feature_value_iter_end; ++feature_value_iter, ++feature_name_iter)
{
const uint64_t word_hash =
all->parser_runtime.example_parser->hasher(feature_name_iter->c_str(), feature_name_iter->size(), _c_hash);
all->parser_runtime.example_parser->hasher(feature_name_iter->c_str(), feature_name_iter->size(), _c_hash) &
all->runtime_state.parse_mask;
fs.push_back(*feature_value_iter, word_hash);
if (ns_name != nullptr)
{
Expand All @@ -440,36 +461,13 @@ int parser::parse_namespaces(VW::workspace* all, example* ae, const Namespace* n
}
else
{
if (!flatbuffers::IsFieldPresent(ns, Namespace::VT_FEATURE_HASHES))
{
if (_active_collection && _active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_feature_hashes_names_missing)
<< "Unable to parse namespace in collection item with example index " << _example_index
<< "and multi example index " << _multi_ex_index;
}
else if (_active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_feature_hashes_names_missing)
<< "Unable to parse namespace in multi example index " << _multi_ex_index;
}
else { RETURN_ERROR_LS(status, fb_parser_feature_hashes_names_missing) << "Unable to parse namespace "; }
}
if (!has_hashes) { RETURN_NS_PARSER_ERROR(status, fb_parser_name_hash_missing) }

if (ns->feature_hashes()->size() != ns->feature_values()->size())
{
if (_active_collection && _active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_hashes_ft_values)
<< "Unable to parse namespace in collection item with example index " << _example_index
<< "and multi example index " << _multi_ex_index;
}
else if (_active_multi_ex)
{
RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_hashes_ft_values)
<< "Unable to parse namespace in multi example index " << _multi_ex_index;
}
else { RETURN_ERROR_LS(status, fb_parser_size_mismatch_ft_hashes_ft_values) << "Unable to parse namespace "; }
RETURN_NS_PARSER_ERROR(status, fb_parser_size_mismatch_ft_hashes_ft_values)

Check warning on line 468 in vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc

View check run for this annotation

Codecov / codecov/patch

vowpalwabbit/fb_parser/src/parse_example_flatbuffer.cc#L468

Added line #L468 was not covered by tests
}

auto feature_hash_iter = (ns->feature_hashes())->begin();
for (; feature_value_iter != feature_value_iter_end; ++feature_value_iter, ++feature_hash_iter)
{
Expand Down
Loading

0 comments on commit ca77f15

Please sign in to comment.