Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit aab3f75

Browse files
committed
Add non-lazy data import.
Signed-off-by: ienkovich <[email protected]>
1 parent c3734c8 commit aab3f75

File tree

5 files changed

+33
-15
lines changed

5 files changed

+33
-15
lines changed

omniscidb/ArrowStorage/ArrowStorage.cpp

+7-3
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,8 @@ void ArrowStorage::appendArrowTable(std::shared_ptr<arrow::Table> at, int table_
715715
switch (col_arr->type()->id()) {
716716
case arrow::Type::STRING:
717717
// if the dictionary has already been materialized, append indices
718-
if (!config_->storage.enable_lazy_dict_materialization ||
718+
if (config_->storage.enable_non_lazy_data_import ||
719+
!config_->storage.enable_lazy_dict_materialization ||
719720
dict_data->is_materialized) {
720721
col_arr = createDictionaryEncodedColumn(
721722
dict_data->dict()->stringDict.get(), col_arr, col_type);
@@ -730,8 +731,11 @@ void ArrowStorage::appendArrowTable(std::shared_ptr<arrow::Table> at, int table_
730731
}
731732
} else if (col_type->isString()) {
732733
} else {
733-
col_arr = replaceNullValues(
734-
col_arr, col_type, dict_data ? dict_data->dict()->stringDict.get() : nullptr);
734+
col_arr =
735+
replaceNullValues(col_arr,
736+
col_type,
737+
dict_data ? dict_data->dict()->stringDict.get() : nullptr,
738+
config_->storage.enable_non_lazy_data_import);
735739
}
736740

737741
col_data[col_idx] = col_arr;

omniscidb/ArrowStorage/ArrowStorageUtils.cpp

+14-11
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,9 @@ void copyArrayDataReplacingNulls(T* dst, std::shared_ptr<arrow::Array> arr) {
174174

175175
template <typename T>
176176
std::shared_ptr<arrow::ChunkedArray> replaceNullValuesImpl(
177-
std::shared_ptr<arrow::ChunkedArray> arr) {
178-
if (!std::is_same_v<T, bool> && arr->null_count() == 0) {
177+
std::shared_ptr<arrow::ChunkedArray> arr,
178+
bool force_copy) {
179+
if (!force_copy && !std::is_same_v<T, bool> && arr->null_count() == 0) {
179180
// for boolean columns we still need to convert bitmaps to array
180181
return arr;
181182
}
@@ -884,7 +885,9 @@ std::shared_ptr<arrow::ChunkedArray> convertDecimalToInteger(
884885
std::shared_ptr<arrow::ChunkedArray> replaceNullValues(
885886
std::shared_ptr<arrow::ChunkedArray> arr,
886887
const hdk::ir::Type* type,
887-
StringDictionary* dict) {
888+
StringDictionary* dict,
889+
bool force_single_chunk) {
890+
bool force_copy = force_single_chunk && (arr->chunks().size() > 1);
888891
if (type->isTime()) {
889892
if (type->size() != 8) {
890893
throw std::runtime_error("Unsupported time type for Arrow import: "s +
@@ -902,7 +905,7 @@ std::shared_ptr<arrow::ChunkedArray> replaceNullValues(
902905
case 2:
903906
return convertDateReplacingNulls<int32_t, int16_t>(arr);
904907
case 4:
905-
return replaceNullValuesImpl<int32_t>(arr);
908+
return replaceNullValuesImpl<int32_t>(arr, force_copy);
906909
case 8:
907910
return convertDateReplacingNulls<int32_t, int64_t>(arr);
908911
default:
@@ -912,26 +915,26 @@ std::shared_ptr<arrow::ChunkedArray> replaceNullValues(
912915
} else if (type->isInteger() || type->isTimestamp()) {
913916
switch (type->size()) {
914917
case 1:
915-
return replaceNullValuesImpl<int8_t>(arr);
918+
return replaceNullValuesImpl<int8_t>(arr, force_copy);
916919
case 2:
917-
return replaceNullValuesImpl<int16_t>(arr);
920+
return replaceNullValuesImpl<int16_t>(arr, force_copy);
918921
case 4:
919-
return replaceNullValuesImpl<int32_t>(arr);
922+
return replaceNullValuesImpl<int32_t>(arr, force_copy);
920923
case 8:
921-
return replaceNullValuesImpl<int64_t>(arr);
924+
return replaceNullValuesImpl<int64_t>(arr, force_copy);
922925
default:
923926
throw std::runtime_error("Unsupported integer/datetime type for Arrow import: "s +
924927
type->toString());
925928
}
926929
} else if (type->isFloatingPoint()) {
927930
switch (type->as<hdk::ir::FloatingPointType>()->precision()) {
928931
case hdk::ir::FloatingPointType::kFloat:
929-
return replaceNullValuesImpl<float>(arr);
932+
return replaceNullValuesImpl<float>(arr, force_copy);
930933
case hdk::ir::FloatingPointType::kDouble:
931-
return replaceNullValuesImpl<double>(arr);
934+
return replaceNullValuesImpl<double>(arr, force_copy);
932935
}
933936
} else if (type->isBoolean()) {
934-
return replaceNullValuesImpl<bool>(arr);
937+
return replaceNullValuesImpl<bool>(arr, force_copy);
935938
} else if (type->isFixedLenArray()) {
936939
return replaceNullValuesFixedSizeArray(arr, type, dict);
937940
} else if (type->isVarLenArray()) {

omniscidb/ArrowStorage/ArrowStorageUtils.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ std::shared_ptr<arrow::DataType> getArrowImportType(hdk::ir::Context& ctx,
2828
std::shared_ptr<arrow::ChunkedArray> replaceNullValues(
2929
std::shared_ptr<arrow::ChunkedArray> arr,
3030
const hdk::ir::Type* type,
31-
StringDictionary* dict = nullptr);
31+
StringDictionary* dict = nullptr,
32+
bool force_single_chunk = false);
3233

3334
std::shared_ptr<arrow::ChunkedArray> convertDecimalToInteger(
3435
std::shared_ptr<arrow::ChunkedArray> arr,

omniscidb/ConfigBuilder/ConfigBuilder.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,15 @@ bool ConfigBuilder::parseCommandLineArgs(int argc,
588588
->default_value(config_->storage.enable_lazy_dict_materialization)
589589
->implicit_value(true),
590590
"Enable lazy materialization of string dictionary columns from Arrow Storage.");
591+
opt_desc.add_options()(
592+
"enable-non-lazy-data-import",
593+
po::value<bool>(&config_->storage.enable_non_lazy_data_import)
594+
->default_value(config_->storage.enable_non_lazy_data_import)
595+
->implicit_value(true),
596+
"Enable non-lazy data import in Arrow Storage. When enabled, we do as much data "
597+
"processing on import as we might require. This might increase overall execution "
598+
"time. This option can be used to split data import and execution for performance "
599+
"measurements.");
591600

592601
if (allow_gtest_flags) {
593602
opt_desc.add_options()("gtest_list_tests", "list all test");

omniscidb/Shared/Config.h

+1
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ struct DebugConfig {
177177

178178
struct StorageConfig {
179179
bool enable_lazy_dict_materialization = false;
180+
bool enable_non_lazy_data_import = false;
180181
};
181182

182183
struct Config {

0 commit comments

Comments
 (0)