Skip to content
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ set(ARROW_SRCS
array/builder_dict.cc
array/builder_nested.cc
array/builder_primitive.cc
array/builder_union.cc

buffer.cc
compare.cc
Expand Down
24 changes: 21 additions & 3 deletions cpp/src/arrow/array/builder_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(value_builder_->FinishInternal(&items));
}

// If the type has not been specified in the constructor, infer it
// This is the case if the value_builder contains a DenseUnionBuilder
if (!arrow::internal::checked_cast<ListType&>(*type_).value_type()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll ask again: why are you doing this, since it is already done in the constructor? (see above)
Is there a situation where the ListBuilder constructor isn't called?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, this is for the case where type_.value_type() has been initialized to nullptr in the constructor (i.e. the type wasn't known when the ListBuilder was constructed so it was nullptr there), so it will be computed in the Finish method. This is used if the type is inferred automatically during Finish. This requires btw that .Finish of the value builder is called first (which it is). Does that make sense?

Copy link
Member

@pitrou pitrou Jan 31, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh.... Do you mean when value_builder_ itself is a type-inferred union builder? If so, can you add a comment? :-) Thanks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added the comment :)

type_ = std::static_pointer_cast<DataType>(
std::make_shared<ListType>(value_builder_->type()));
}
std::shared_ptr<Buffer> null_bitmap;
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
*out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_);
Expand Down Expand Up @@ -138,17 +144,29 @@ void StructBuilder::Reset() {
Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
std::shared_ptr<Buffer> null_bitmap;
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
*out = ArrayData::Make(type_, length_, {null_bitmap}, null_count_);

(*out)->child_data.resize(children_.size());
std::vector<std::shared_ptr<ArrayData>> child_data(children_.size());
for (size_t i = 0; i < children_.size(); ++i) {
if (length_ == 0) {
// Try to make sure the child buffers are initialized
RETURN_NOT_OK(children_[i]->Resize(0));
}
RETURN_NOT_OK(children_[i]->FinishInternal(&(*out)->child_data[i]));
RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i]));
}

// If the type has not been specified in the constructor, infer it
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, I wonder why this couldn't be done in the StructBuilder constructor, where the fields are already known.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to the above: if the types associated with the StructBuilder were nullptr at construction time, they will be computed when .Finish is called.

// This is the case if one of the children contains a DenseUnionBuilder
if (!type_) {
std::vector<std::shared_ptr<Field>> fields;
for (const auto& field_builder : children_) {
fields.push_back(field("", field_builder->type()));
}
type_ = struct_(fields);
}

*out = ArrayData::Make(type_, length_, {null_bitmap}, null_count_);
(*out)->child_data = std::move(child_data);

capacity_ = length_ = null_count_ = 0;
return Status::OK();
}
Expand Down
60 changes: 60 additions & 0 deletions cpp/src/arrow/array/builder_union.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/array/builder_union.h"

#include <utility>

#include "arrow/util/logging.h"

namespace arrow {

/// \brief Construct a dense union builder.
///
/// Passes `type` and `pool` on to the ArrayBuilder base class and constructs
/// the type-id and offsets buffer builders with the same `pool`.
///
/// \param[in] pool memory pool handed to the base class and both buffer builders
/// \param[in] type the union type; may be null, in which case FinishInternal
///            infers a dense union type from the registered children
DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool,
const std::shared_ptr<DataType>& type)
: ArrayBuilder(type, pool), types_builder_(pool), offsets_builder_(pool) {}

/// \brief Finalize the union array and hand its data to the caller.
///
/// Finishes the type-id, offsets and null-bitmap buffers, then finishes every
/// child builder in the order the children were added. If no type was given
/// at construction time, a dense union type is inferred from the children:
/// field i is named field_names_[i], has the type reported by child i, and is
/// assigned type id i.
///
/// \param[out] out receives the ArrayData of the finished union array
/// \return the first non-OK Status reported by a buffer builder or child
///         builder, otherwise Status::OK()
Status DenseUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
  std::shared_ptr<Buffer> types;
  RETURN_NOT_OK(types_builder_.Finish(&types));
  std::shared_ptr<Buffer> offsets;
  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));

  std::shared_ptr<Buffer> null_bitmap;
  RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

  const size_t num_children = children_.size();

  // Finish each child straight into its final slot; no temporary needed.
  std::vector<std::shared_ptr<ArrayData>> child_data(num_children);
  for (size_t i = 0; i < num_children; ++i) {
    RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i]));
  }

  // If the type has not been specified in the constructor, infer it.
  // This must happen after the children are finished: a child whose own type
  // is inferred only reports it once its FinishInternal has run.
  if (!type_) {
    std::vector<std::shared_ptr<Field>> fields;
    std::vector<uint8_t> type_ids;
    fields.reserve(num_children);
    type_ids.reserve(num_children);
    for (size_t i = 0; i < num_children; ++i) {
      fields.push_back(field(field_names_[i], children_[i]->type()));
      type_ids.push_back(static_cast<uint8_t>(i));
    }
    type_ = union_(fields, type_ids, UnionMode::DENSE);
  }

  *out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_);
  (*out)->child_data = std::move(child_data);

  // The buffers now belong to `out`; reset the element counters so that a
  // later build on this builder starts from zero instead of reporting the
  // length of the array that was just handed off (same reset as
  // StructBuilder::FinishInternal performs).
  capacity_ = length_ = null_count_ = 0;
  return Status::OK();
}

} // namespace arrow
89 changes: 89 additions & 0 deletions cpp/src/arrow/array/builder_union.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/builder_base.h"
#include "arrow/buffer-builder.h"

namespace arrow {

/// \class DenseUnionBuilder
///
/// \brief Incremental builder for dense union arrays.
///
/// Register every child builder with AppendChild; each call returns the
/// int8_t type tag assigned to that child. To add an element, call Append
/// with the child's tag and then append the value itself to that child
/// builder.
///
/// The union type can be passed to the constructor, or left out (null), in
/// which case the DenseUnionBuilder infers it when the array is finished.
///
/// This API is EXPERIMENTAL.
class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder {
 public:
  /// Use this constructor to incrementally build the union array along
  /// with types, offsets, and null bitmap.
  explicit DenseUnionBuilder(MemoryPool* pool,
                             const std::shared_ptr<DataType>& type = NULLPTR);

  /// \brief Append a null element: type tag 0, offset 0, and a cleared
  /// validity bit.
  Status AppendNull() { return AppendSlot(0, 0, false); }

  /// \brief Append an element to the UnionArray. This must be followed
  /// by an append to the appropriate child builder.
  /// \param[in] type index of the child the value will be appended
  /// \param[in] offset offset of the value in that child
  Status Append(int8_t type, int32_t offset) { return AppendSlot(type, offset, true); }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \brief Make a new child builder available to the UnionArray
  ///
  /// \param[in] child the child builder
  /// \param[in] field_name the name of the field in the union array type
  /// if type inference is used
  /// \return child index, which is the "type" argument that needs
  /// to be passed to the "Append" method to add a new element to
  /// the union array.
  int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& child,
                     const std::string& field_name = "") {
    // The new child's index equals the number of children registered so far.
    const auto child_tag = static_cast<int8_t>(children_.size());
    children_.push_back(child);
    field_names_.push_back(field_name);
    return child_tag;
  }

 private:
  /// Record one union slot: its type tag, its offset, then its validity bit.
  /// Stops at, and returns, the first failing append.
  Status AppendSlot(int8_t type, int32_t offset, bool is_valid) {
    ARROW_RETURN_NOT_OK(types_builder_.Append(type));
    ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset));
    return AppendToBitmap(is_valid);
  }

  TypedBufferBuilder<int8_t> types_builder_;
  TypedBufferBuilder<int32_t> offsets_builder_;
  std::vector<std::string> field_names_;
};

} // namespace arrow
Loading