Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 4 additions & 23 deletions c_glib/arrow-dataset-glib/scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ gad_scan_options_class_init(GADScanOptionsClass *klass)
gobject_class->set_property = gad_scan_options_set_property;
gobject_class->get_property = gad_scan_options_get_property;

auto scan_options = arrow::dataset::ScanOptions::Make(arrow::schema({}));
auto scan_options = std::make_shared<arrow::dataset::ScanOptions>();
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I will create a new pull-request to remove the bindings of these internal classes.


spec = g_param_spec_pointer("scan-options",
"ScanOptions",
Expand Down Expand Up @@ -307,7 +307,8 @@ GADScanOptions *
gad_scan_options_new(GArrowSchema *schema)
{
auto arrow_schema = garrow_schema_get_raw(schema);
auto arrow_scan_options = arrow::dataset::ScanOptions::Make(arrow_schema);
auto arrow_scan_options = std::make_shared<arrow::dataset::ScanOptions>();
arrow_scan_options->dataset_schema = arrow_schema;
return gad_scan_options_new_raw(&arrow_scan_options);
}

Expand All @@ -323,30 +324,10 @@ GArrowSchema *
gad_scan_options_get_schema(GADScanOptions *scan_options)
{
auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(scan_options);
auto arrow_schema = priv->scan_options->schema();
auto arrow_schema = priv->scan_options->dataset_schema;
return garrow_schema_new_raw(&arrow_schema);
}

/**
* gad_scan_options_replace_schema:
* @scan_options: A #GADScanOptions.
* @schema: A #GArrowSchema.
*
* Returns: (transfer full):
* A copy of the #GADScanOptions with the given #GArrowSchema.
*
* Since: 1.0.0
*/
GADScanOptions *
gad_scan_options_replace_schema(GADScanOptions *scan_options,
GArrowSchema *schema)
{
auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(scan_options);
auto arrow_schema = garrow_schema_get_raw(schema);
auto arrow_scan_options_copy = priv->scan_options->ReplaceSchema(arrow_schema);
return gad_scan_options_new_raw(&arrow_scan_options_copy);
}

/* arrow::dataset::ScanTask */

typedef struct GADScanTaskPrivate_ {
Expand Down
3 changes: 0 additions & 3 deletions c_glib/arrow-dataset-glib/scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,6 @@ GARROW_AVAILABLE_IN_1_0
GADScanOptions *gad_scan_options_new(GArrowSchema *schema);
GARROW_AVAILABLE_IN_1_0
GArrowSchema *gad_scan_options_get_schema(GADScanOptions *scan_options);
GARROW_AVAILABLE_IN_1_0
GADScanOptions *gad_scan_options_replace_schema(GADScanOptions *scan_options,
GArrowSchema *schema);

/* arrow::dataset::ScanTask */

Expand Down
7 changes: 0 additions & 7 deletions c_glib/test/dataset/test-scan-options.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,4 @@ def test_batch_size
assert_equal(42,
@scan_options.batch_size)
end

def test_replace_schema
other_schema = Arrow::Schema.new([Arrow::Field.new("visible", Arrow::BooleanDataType.new)])
other_scan_options = @scan_options.replace_schema(other_schema)
assert_not_equal(@schema, other_scan_options.schema)
assert_equal(other_schema, other_scan_options.schema)
end
end
17 changes: 16 additions & 1 deletion cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,25 @@ struct CompareOptions : public FunctionOptions {
};

struct ARROW_EXPORT ProjectOptions : public FunctionOptions {
explicit ProjectOptions(std::vector<std::string> n) : field_names(std::move(n)) {}
ProjectOptions(std::vector<std::string> n, std::vector<bool> r,
std::vector<std::shared_ptr<const KeyValueMetadata>> m)
: field_names(std::move(n)),
field_nullability(std::move(r)),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nowhere outside of tests do you specify nullability/metadata. If a customer wanted a field to have a custom metadata couldn't they just add it on themselves after the fact? I suppose you could extend my argument to the field names as well but even SQL allows as XYZ.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently I only specify nullability and metadata for passthru projected fields https://github.com/bkietz/arrow/blob/1b95d70e4096704c702d1464e93bd218c30498d4/cpp/src/arrow/dataset/scanner_internal.h#L169-L173

For other fields (for example the field resulting from an arithmetic operation) users will need to attach any desired metadata after the scan. I require specifying field names for all projections since the default field name for a column like SELECT (total - amt) / total isn't clear to me.

field_metadata(std::move(m)) {}

explicit ProjectOptions(std::vector<std::string> n)
: field_names(std::move(n)),
field_nullability(field_names.size(), true),
field_metadata(field_names.size(), NULLPTR) {}

/// Names for wrapped columns
std::vector<std::string> field_names;

/// Nullability bits for wrapped columns
std::vector<bool> field_nullability;

/// Metadata attached to wrapped columns
std::vector<std::shared_ptr<const KeyValueMetadata>> field_metadata;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to store a single vector, e.g.:

struct ProjectField {
  std::string name;
  bool nullable;
  std::shared_ptr<const KeyValueMetadata> metadata;

  ProjectField(std::string name, bool nullable=false,
      std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR)
    : name(std::move(name)), nullable(nullable), metadata(std::move(metadata)) {}
};

std::vector<ProjectField> fields_;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fine with me; I was just following a perceived convention https://github.com/bkietz/arrow/blob/1b95d70e4096704c702d1464e93bd218c30498d4/cpp/src/arrow/array/array_nested.h#L331

I could also just use a Field with a null type

};

/// @}
Expand Down
24 changes: 20 additions & 4 deletions cpp/src/arrow/compute/kernels/scalar_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,15 @@ const FunctionDoc list_value_length_doc{
Result<ValueDescr> ProjectResolve(KernelContext* ctx,
const std::vector<ValueDescr>& descrs) {
const auto& names = OptionsWrapper<ProjectOptions>::Get(ctx).field_names;
if (names.size() != descrs.size()) {
return Status::Invalid("project() was passed ", names.size(), " field ", "names but ",
descrs.size(), " arguments");
const auto& nullable = OptionsWrapper<ProjectOptions>::Get(ctx).field_nullability;
const auto& metadata = OptionsWrapper<ProjectOptions>::Get(ctx).field_metadata;

if (names.size() != descrs.size() || nullable.size() != descrs.size() ||
metadata.size() != descrs.size()) {
return Status::Invalid("project() was passed ", descrs.size(), " arguments but ",
names.size(), " field names, ", nullable.size(),
" nullability bits, and ", metadata.size(),
" metadata dictionaries.");
}

size_t i = 0;
Expand All @@ -86,7 +92,7 @@ Result<ValueDescr> ProjectResolve(KernelContext* ctx,
}
}

fields[i] = field(names[i], descr.type);
fields[i] = field(names[i], descr.type, nullable[i], metadata[i]);
++i;
}

Expand All @@ -96,6 +102,16 @@ Result<ValueDescr> ProjectResolve(KernelContext* ctx,
void ProjectExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
KERNEL_ASSIGN_OR_RAISE(auto descr, ctx, ProjectResolve(ctx, batch.GetDescriptors()));

for (int i = 0; i < batch.num_values(); ++i) {
const auto& field = checked_cast<const StructType&>(*descr.type).field(i);
if (batch[i].null_count() > 0 && !field->nullable()) {
ctx->SetStatus(Status::Invalid("Output field ", field, " (#", i,
") does not allow nulls but the corresponding "
"argument was not entirely valid."));
return;
}
}

if (descr.shape == ValueDescr::SCALAR) {
ScalarVector scalars(batch.num_values());
for (int i = 0; i < batch.num_values(); ++i) {
Expand Down
88 changes: 53 additions & 35 deletions cpp/src/arrow/compute/kernels/scalar_nested_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "arrow/compute/kernels/test_util.h"
#include "arrow/result.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/key_value_metadata.h"

namespace arrow {
namespace compute {
Expand All @@ -38,51 +39,71 @@ TEST(TestScalarNested, ListValueLength) {
}

struct {
public:
Result<Datum> operator()(std::vector<Datum> args) {
ProjectOptions opts{field_names};
template <typename... Options>
Result<Datum> operator()(std::vector<Datum> args, std::vector<std::string> field_names,
Options... options) {
ProjectOptions opts{field_names, options...};
return CallFunction("project", args, &opts);
}

std::vector<std::string> field_names;
} Project;

TEST(Project, Scalar) {
std::shared_ptr<StructScalar> expected(new StructScalar{{}, struct_({})});
ASSERT_OK_AND_EQ(Datum(expected), Project({}));

auto i32 = MakeScalar(1);
auto f64 = MakeScalar(2.5);
auto str = MakeScalar("yo");

expected.reset(new StructScalar{
{i32, f64, str},
struct_({field("i", i32->type), field("f", f64->type), field("s", str->type)})});
Project.field_names = {"i", "f", "s"};
ASSERT_OK_AND_EQ(Datum(expected), Project({i32, f64, str}));
ASSERT_OK_AND_ASSIGN(auto expected,
StructScalar::Make({i32, f64, str}, {"i", "f", "s"}));
ASSERT_OK_AND_EQ(Datum(expected), Project({i32, f64, str}, {"i", "f", "s"}));

// Three field names but one input value
ASSERT_RAISES(Invalid, Project({str}));
ASSERT_RAISES(Invalid, Project({str}, {"i", "f", "s"}));

// No field names or input values is fine
expected.reset(new StructScalar{{}, struct_({})});
ASSERT_OK_AND_EQ(Datum(expected), Project(/*args=*/{}, /*field_names=*/{}));
}

TEST(Project, Array) {
Project.field_names = {"i", "s"};
std::vector<std::string> field_names{"i", "s"};

auto i32 = ArrayFromJSON(int32(), "[42, 13, 7]");
auto str = ArrayFromJSON(utf8(), R"(["aa", "aa", "aa"])");
ASSERT_OK_AND_ASSIGN(Datum expected,
StructArray::Make({i32, str}, Project.field_names));
ASSERT_OK_AND_ASSIGN(Datum expected, StructArray::Make({i32, str}, field_names));

ASSERT_OK_AND_EQ(expected, Project({i32, str}));
ASSERT_OK_AND_EQ(expected, Project({i32, str}, field_names));

// Scalars are broadcast to the length of the arrays
ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}));
ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}, field_names));

// Array length mismatch
ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}));
ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}, field_names));
}

TEST(Project, NullableMetadataPassedThru) {
auto i32 = ArrayFromJSON(int32(), "[42, 13, 7]");
auto str = ArrayFromJSON(utf8(), R"(["aa", "aa", "aa"])");

std::vector<std::string> field_names{"i", "s"};
std::vector<bool> nullability{true, false};
std::vector<std::shared_ptr<const KeyValueMetadata>> metadata = {
key_value_metadata({"a", "b"}, {"ALPHA", "BRAVO"}), nullptr};

ASSERT_OK_AND_ASSIGN(auto proj,
Project({i32, str}, field_names, nullability, metadata));

AssertTypeEqual(*proj.type(), StructType({
field("i", int32(), /*nullable=*/true, metadata[0]),
field("s", utf8(), /*nullable=*/false, nullptr),
}));

// error: projecting an array containing nulls with nullable=false
str = ArrayFromJSON(utf8(), R"(["aa", null, "aa"])");
ASSERT_RAISES(Invalid, Project({i32, str}, field_names, nullability, metadata));
}

TEST(Project, ChunkedArray) {
Project.field_names = {"i", "s"};
std::vector<std::string> field_names{"i", "s"};

auto i32_0 = ArrayFromJSON(int32(), "[42, 13, 7]");
auto i32_1 = ArrayFromJSON(int32(), "[]");
Expand All @@ -95,26 +116,23 @@ TEST(Project, ChunkedArray) {
ASSERT_OK_AND_ASSIGN(auto i32, ChunkedArray::Make({i32_0, i32_1, i32_2}));
ASSERT_OK_AND_ASSIGN(auto str, ChunkedArray::Make({str_0, str_1, str_2}));

ASSERT_OK_AND_ASSIGN(auto expected_0,
StructArray::Make({i32_0, str_0}, Project.field_names));
ASSERT_OK_AND_ASSIGN(auto expected_1,
StructArray::Make({i32_1, str_1}, Project.field_names));
ASSERT_OK_AND_ASSIGN(auto expected_2,
StructArray::Make({i32_2, str_2}, Project.field_names));
ASSERT_OK_AND_ASSIGN(auto expected_0, StructArray::Make({i32_0, str_0}, field_names));
ASSERT_OK_AND_ASSIGN(auto expected_1, StructArray::Make({i32_1, str_1}, field_names));
ASSERT_OK_AND_ASSIGN(auto expected_2, StructArray::Make({i32_2, str_2}, field_names));
ASSERT_OK_AND_ASSIGN(Datum expected,
ChunkedArray::Make({expected_0, expected_1, expected_2}));

ASSERT_OK_AND_EQ(expected, Project({i32, str}));
ASSERT_OK_AND_EQ(expected, Project({i32, str}, field_names));

// Scalars are broadcast to the length of the arrays
ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}));
ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}, field_names));

// Array length mismatch
ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}));
ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}, field_names));
}

TEST(Project, ChunkedArrayDifferentChunking) {
Project.field_names = {"i", "s"};
std::vector<std::string> field_names{"i", "s"};

auto i32_0 = ArrayFromJSON(int32(), "[42, 13, 7]");
auto i32_1 = ArrayFromJSON(int32(), "[]");
Expand All @@ -136,18 +154,18 @@ TEST(Project, ChunkedArrayDifferentChunking) {
for (size_t i = 0; i < expected_chunks.size(); ++i) {
ASSERT_OK_AND_ASSIGN(expected_chunks[i], StructArray::Make({expected_rechunked[0][i],
expected_rechunked[1][i]},
Project.field_names));
field_names));
}

ASSERT_OK_AND_ASSIGN(Datum expected, ChunkedArray::Make(expected_chunks));

ASSERT_OK_AND_EQ(expected, Project({i32, str}));
ASSERT_OK_AND_EQ(expected, Project({i32, str}, field_names));

// Scalars are broadcast to the length of the arrays
ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}));
ASSERT_OK_AND_EQ(expected, Project({i32, MakeScalar("aa")}, field_names));

// Array length mismatch
ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}));
ASSERT_RAISES(Invalid, Project({i32->Slice(1), str}, field_names));
}

} // namespace compute
Expand Down
Loading