-
Notifications
You must be signed in to change notification settings - Fork 3.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-554: [C++] Add functions to unify dictionary types and arrays #3165
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,7 @@ | |
#include "arrow/util/bit-util.h" | ||
#include "arrow/util/checked_cast.h" | ||
#include "arrow/util/decimal.h" | ||
#include "arrow/util/int-util.h" | ||
#include "arrow/util/logging.h" | ||
#include "arrow/util/macros.h" | ||
#include "arrow/visitor.h" | ||
|
@@ -663,6 +664,66 @@ std::shared_ptr<Array> DictionaryArray::dictionary() const { | |
return dict_type_->dictionary(); | ||
} | ||
|
||
// Build a new array of type `type` whose indices are this array's indices | ||
// remapped through `transpose_map`. | ||
// | ||
// InType / OutType are the index types of the input and output; their | ||
// c_type members select the element width read from `in_data` and written | ||
// to the freshly allocated output buffer, so the index width may change. | ||
// | ||
// pool: memory pool used to allocate the output index buffer | ||
// in_data: array data holding the indices to remap (buffer 1) | ||
// type: type given to the resulting array data | ||
// transpose_map: lookup table passed to internal::TransposeInts | ||
// out: receives the array built from the remapped indices | ||
// | ||
// Returns the AllocateBuffer error if allocation fails, Status::OK otherwise. | ||
template <typename InType, typename OutType> | ||
static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data, | ||
const std::shared_ptr<DataType>& type, | ||
const std::vector<int32_t>& transpose_map, | ||
std::shared_ptr<Array>* out) { | ||
using in_c_type = typename InType::c_type; | ||
using out_c_type = typename OutType::c_type; | ||
|
||
// One output slot of the output index width per input element. | ||
std::shared_ptr<Buffer> out_buffer; | ||
RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer)); | ||
// Null bitmap is unchanged | ||
// (the input's buffers[0] is reused as-is and null_count is carried over). | ||
// NOTE(review): the output is created without an explicit offset while the | ||
// validity buffer is shared unmodified; confirm this is correct when | ||
// in_data has a non-zero offset (sliced input). | ||
auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer}, | ||
in_data.null_count); | ||
// Fill the output index buffer from the input indices and the map. | ||
internal::TransposeInts(in_data.GetValues<in_c_type>(1), | ||
out_data->GetMutableValues<out_c_type>(1), in_data.length, | ||
transpose_map.data()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: this is a specific case of the "take" function, a la ndarray.take or "from" verb in J. Here we have out = in TAKE transpose_map There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, though this one has transparent up/downcasting as well (which we probably don't want in the general case of a "take" function). |
||
*out = MakeArray(out_data); | ||
return Status::OK(); | ||
} | ||
|
||
// Produce a new array of dictionary type `type` whose indices are this | ||
// array's indices remapped through `transpose_map`. | ||
// | ||
// The work is done by TransposeDictIndices<In, Out>, selected by a nested | ||
// switch over the index type of this array and the index type of `type`. | ||
// Int8Type, Int16Type, Int32Type and Int64Type are handled on both sides; | ||
// any other index type yields Status::NotImplemented. | ||
// | ||
// `type` must be a dictionary type: this is only checked with DCHECK_EQ | ||
// before the checked_cast to DictionaryType. | ||
Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type, | ||
const std::vector<int32_t>& transpose_map, | ||
std::shared_ptr<Array>* out) const { | ||
DCHECK_EQ(type->id(), Type::DICTIONARY); | ||
const auto& out_dict_type = checked_cast<const DictionaryType&>(*type); | ||
|
||
// XXX We'll probably want to make this operation a kernel when we | ||
// implement dictionary-to-dictionary casting. | ||
auto in_type_id = dict_type_->index_type()->id(); | ||
auto out_type_id = out_dict_type.index_type()->id(); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think it's worth the (minor) expense of checking for no-op case where Perhaps let's create a follow up JIRA so as to not hold up this patch There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is already an XXX for that in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
// Inner case label: for a fixed input index type, dispatch on the output | ||
// index type and return the result of the matching instantiation. | ||
#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \ | ||
case OUT_INDEX_TYPE::type_id: \ | ||
return TransposeDictIndices<IN_INDEX_TYPE, OUT_INDEX_TYPE>(pool, *data(), type, \ | ||
transpose_map, out); | ||
|
||
// Outer case label: for one input index type, expand the inner switch over | ||
// the four supported output index types. Every inner path returns, so | ||
// there is no fallthrough between outer cases. | ||
#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \ | ||
case IN_INDEX_TYPE::type_id: \ | ||
switch (out_type_id) { \ | ||
TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \ | ||
TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \ | ||
TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \ | ||
TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \ | ||
default: \ | ||
return Status::NotImplemented("unexpected index type"); \ | ||
} | ||
|
||
// Dispatch on this array's own index type. | ||
switch (in_type_id) { | ||
TRANSPOSE_IN_CASE(Int8Type) | ||
TRANSPOSE_IN_CASE(Int16Type) | ||
TRANSPOSE_IN_CASE(Int32Type) | ||
TRANSPOSE_IN_CASE(Int64Type) | ||
default: | ||
return Status::NotImplemented("unexpected index type"); | ||
} | ||
|
||
// The helper macros are local to this function; undefine them again. | ||
#undef TRANSPOSE_IN_OUT_CASE | ||
#undef TRANSPOSE_IN_CASE | ||
} | ||
|
||
// ---------------------------------------------------------------------- | ||
// Implement Array::Accept as inline visitor | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -422,6 +422,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { | |
|
||
/// Return element i of raw_values(); the index is not range-checked here. | ||
value_type Value(int64_t i) const { return raw_values()[i]; } | ||
|
||
// For API compatibility with BinaryArray etc. | ||
// Simply forwards to Value(i) and returns the element by value. | ||
value_type GetView(int64_t i) const { return Value(i); } | ||
|
||
protected: | ||
using PrimitiveArray::PrimitiveArray; | ||
}; | ||
|
@@ -442,6 +445,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { | |
i + data_->offset); | ||
} | ||
|
||
// Forwards to Value(i); provides the same GetView accessor name that | ||
// NumericArray exposes. | ||
bool GetView(int64_t i) const { return Value(i); } | ||
|
||
protected: | ||
using PrimitiveArray::PrimitiveArray; | ||
}; | ||
|
@@ -802,14 +807,31 @@ class ARROW_EXPORT DictionaryArray : public Array { | |
/// This function does the validation of the indices and input type. It checks if | ||
/// all indices are non-negative and smaller than the size of the dictionary | ||
/// | ||
/// \param[in] type a data type containing a dictionary | ||
/// \param[in] type a dictionary type | ||
/// \param[in] indices an array of non-negative signed | ||
/// integers smaller than the size of the dictionary | ||
/// \param[out] out the resulting DictionaryArray instance | ||
static Status FromArrays(const std::shared_ptr<DataType>& type, | ||
const std::shared_ptr<Array>& indices, | ||
std::shared_ptr<Array>* out); | ||
|
||
/// \brief Transpose this DictionaryArray | ||
/// | ||
/// This method constructs a new dictionary array with the given dictionary type, | ||
/// transposing indices using the transpose map. | ||
/// The type and the transpose map are typically computed using | ||
/// DictionaryType::Unify. | ||
/// | ||
/// \param[in] pool a pool to allocate the array data from | ||
/// \param[in] type a dictionary type | ||
/// \param[in] transpose_map a vector transposing this array's indices | ||
/// into the target array's indices | ||
/// \param[out] out the resulting DictionaryArray instance | ||
/// \return Status::OK on success; Status::NotImplemented if the index type | ||
/// of this array or of `type` is not Int8, Int16, Int32 or Int64; or the | ||
/// error reported by the buffer allocation | ||
Status Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type, | ||
const std::vector<int32_t>& transpose_map, | ||
std::shared_ptr<Array>* out) const; | ||
// XXX Do we also want an unsafe in-place Transpose? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we do want that but how we handle it may require some care. This also came up in the context of Arrow Flight where the dictionaries are unknown at the start of an RPC session https://issues.apache.org/jira/browse/ARROW-3144 I would suggest that we address both cases through a single design, i.e. a One complexity that this introduces (which we need to be prepared for anyway) is the some code will need to check whether There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I left a comment in ARROW-3144 directing to this discussion |
||
|
||
std::shared_ptr<Array> indices() const; | ||
std::shared_ptr<Array> dictionary() const; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you type them `DictionaryArray` at the declaration to avoid all casts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, other APIs take `Array` pointers, not `DictionaryArray`.