|
18 | 18 | #include "IR/Type.h"
|
19 | 19 | #include "Shared/ArrowUtil.h"
|
20 | 20 | #include "Shared/measure.h"
|
21 |
| -#include "Shared/threading.h" |
22 | 21 |
|
23 | 22 | #ifdef __GNUC__
|
24 | 23 | #pragma GCC diagnostic push
|
|
32 | 31 | #include <arrow/util/value_parsing.h>
|
33 | 32 | #include <parquet/api/reader.h>
|
34 | 33 | #include <parquet/arrow/reader.h>
|
| 34 | +#include <tbb/blocked_range.h> |
| 35 | +#include <tbb/parallel_for.h> |
35 | 36 |
|
36 | 37 | #ifdef __GNUC__
|
37 | 38 | #pragma GCC diagnostic pop
|
@@ -682,131 +683,126 @@ void ArrowStorage::appendArrowTable(std::shared_ptr<arrow::Table> at, int table_
|
682 | 683 | }
|
683 | 684 | }
|
684 | 685 |
|
685 |
| - threading::parallel_for( |
686 |
| - threading::blocked_range(0, (int)at->columns().size()), [&](auto range) { |
687 |
| - for (auto col_idx = range.begin(); col_idx != range.end(); col_idx++) { |
688 |
| - auto col_info = getColumnInfo(db_id_, table_id, columnId(col_idx)); |
689 |
| - auto col_type = col_info->type; |
690 |
| - auto col_arr = at->column(col_idx); |
691 |
| - |
692 |
| - // Conversion of empty string to Nulls and further processing handled |
693 |
| - // separately. |
694 |
| - if (!col_type->nullable() && col_arr->null_count() != 0 && |
695 |
| - col_arr->type()->id() != arrow::Type::STRING) { |
696 |
| - throw std::runtime_error("Null values used in non-nullable type: "s + |
697 |
| - col_type->toString()); |
698 |
| - } |
| 686 | + tbb::parallel_for(tbb::blocked_range(0, (int)at->columns().size()), [&](auto range) { |
| 687 | + for (auto col_idx = range.begin(); col_idx != range.end(); col_idx++) { |
| 688 | + auto col_info = getColumnInfo(db_id_, table_id, columnId(col_idx)); |
| 689 | + auto col_type = col_info->type; |
| 690 | + auto col_arr = at->column(col_idx); |
699 | 691 |
|
700 |
| - DictionaryData* dict_data = nullptr; |
701 |
| - auto elem_type = |
702 |
| - col_type->isArray() |
703 |
| - ? dynamic_cast<const hdk::ir::ArrayBaseType*>(col_type)->elemType() |
704 |
| - : col_type; |
705 |
| - if (elem_type->isExtDictionary()) { |
706 |
| - dict_data = dicts_ |
707 |
| - .at(dynamic_cast<const hdk::ir::ExtDictionaryType*>(elem_type) |
708 |
| - ->dictId()) |
709 |
| - .get(); |
710 |
| - } |
| 692 | + // Conversion of empty string to Nulls and further processing handled |
| 693 | + // separately. |
| 694 | + if (!col_type->nullable() && col_arr->null_count() != 0 && |
| 695 | + col_arr->type()->id() != arrow::Type::STRING) { |
| 696 | + throw std::runtime_error("Null values used in non-nullable type: "s + |
| 697 | + col_type->toString()); |
| 698 | + } |
711 | 699 |
|
712 |
| - if (col_type->isDecimal()) { |
713 |
| - col_arr = convertDecimalToInteger(col_arr, col_type); |
714 |
| - } else if (col_type->isExtDictionary()) { |
715 |
| - switch (col_arr->type()->id()) { |
716 |
| - case arrow::Type::STRING: |
717 |
| - // if the dictionary has already been materialized, append indices |
718 |
| - if (!config_->storage.enable_lazy_dict_materialization || |
719 |
| - dict_data->is_materialized) { |
720 |
| - col_arr = createDictionaryEncodedColumn( |
721 |
| - dict_data->dict()->stringDict.get(), col_arr, col_type); |
722 |
| - } |
723 |
| - break; |
724 |
| - case arrow::Type::DICTIONARY: |
725 |
| - col_arr = convertArrowDictionary( |
726 |
| - dict_data->dict()->stringDict.get(), col_arr, col_type); |
727 |
| - break; |
728 |
| - default: |
729 |
| - CHECK(false); |
| 700 | + DictionaryData* dict_data = nullptr; |
| 701 | + auto elem_type = |
| 702 | + col_type->isArray() |
| 703 | + ? dynamic_cast<const hdk::ir::ArrayBaseType*>(col_type)->elemType() |
| 704 | + : col_type; |
| 705 | + if (elem_type->isExtDictionary()) { |
| 706 | + dict_data = |
| 707 | + dicts_ |
| 708 | + .at(dynamic_cast<const hdk::ir::ExtDictionaryType*>(elem_type)->dictId()) |
| 709 | + .get(); |
| 710 | + } |
| 711 | + |
| 712 | + if (col_type->isDecimal()) { |
| 713 | + col_arr = convertDecimalToInteger(col_arr, col_type); |
| 714 | + } else if (col_type->isExtDictionary()) { |
| 715 | + switch (col_arr->type()->id()) { |
| 716 | + case arrow::Type::STRING: |
| 717 | + // if the dictionary has already been materialized, append indices |
| 718 | + if (!config_->storage.enable_lazy_dict_materialization || |
| 719 | + dict_data->is_materialized) { |
| 720 | + col_arr = createDictionaryEncodedColumn( |
| 721 | + dict_data->dict()->stringDict.get(), col_arr, col_type); |
730 | 722 | }
|
731 |
| - } else if (col_type->isString()) { |
732 |
| - } else { |
733 |
| - col_arr = replaceNullValues( |
734 |
| - col_arr, |
735 |
| - col_type, |
736 |
| - dict_data ? dict_data->dict()->stringDict.get() : nullptr); |
737 |
| - } |
| 723 | + break; |
| 724 | + case arrow::Type::DICTIONARY: |
| 725 | + col_arr = convertArrowDictionary( |
| 726 | + dict_data->dict()->stringDict.get(), col_arr, col_type); |
| 727 | + break; |
| 728 | + default: |
| 729 | + CHECK(false); |
| 730 | + } |
| 731 | + } else if (col_type->isString()) { |
| 732 | + } else { |
| 733 | + col_arr = replaceNullValues( |
| 734 | + col_arr, col_type, dict_data ? dict_data->dict()->stringDict.get() : nullptr); |
| 735 | + } |
738 | 736 |
|
739 |
| - col_data[col_idx] = col_arr; |
| 737 | + col_data[col_idx] = col_arr; |
740 | 738 |
|
741 |
| - bool compute_stats = !col_type->isString(); |
742 |
| - if (compute_stats) { |
743 |
| - size_t elems_count = 1; |
744 |
| - if (col_type->isFixedLenArray()) { |
745 |
| - elems_count = col_type->size() / elem_type->size(); |
746 |
| - } |
747 |
| - // Compute stats for each fragment. |
748 |
| - threading::parallel_for( |
749 |
| - threading::blocked_range(size_t(0), frag_count), [&](auto frag_range) { |
750 |
| - for (size_t frag_idx = frag_range.begin(); frag_idx != frag_range.end(); |
751 |
| - ++frag_idx) { |
752 |
| - auto& frag = fragments[frag_idx]; |
753 |
| - |
754 |
| - frag.offset = |
755 |
| - frag_idx |
756 |
| - ? ((frag_idx - 1) * table.fragment_size + first_frag_size) |
757 |
| - : 0; |
758 |
| - frag.row_count = |
759 |
| - frag_idx |
760 |
| - ? std::min(table.fragment_size, |
761 |
| - static_cast<size_t>(at->num_rows()) - frag.offset) |
762 |
| - : first_frag_size; |
763 |
| - |
764 |
| - size_t num_bytes; |
765 |
| - if (col_type->isFixedLenArray()) { |
766 |
| - num_bytes = frag.row_count * col_type->size(); |
767 |
| - } else if (col_type->isVarLenArray()) { |
768 |
| - num_bytes = |
769 |
| - computeTotalStringsLength(col_arr, frag.offset, frag.row_count); |
770 |
| - } else { |
771 |
| - num_bytes = frag.row_count * col_type->size(); |
772 |
| - } |
773 |
| - auto meta = std::make_shared<ChunkMetadata>( |
774 |
| - col_info->type, num_bytes, frag.row_count); |
775 |
| - |
776 |
| - if (!lazy_fetch_cols[col_idx]) { |
777 |
| - meta->fillChunkStats(computeStats( |
778 |
| - col_arr->Slice(frag.offset, frag.row_count * elems_count), |
779 |
| - col_type)); |
780 |
| - } else { |
781 |
| - int32_t min = 0; |
782 |
| - int32_t max = -1; |
783 |
| - meta->fillChunkStats(min, max, /*has_nulls=*/true); |
784 |
| - } |
785 |
| - frag.metadata[col_idx] = meta; |
786 |
| - } |
787 |
| - }); // each fragment |
788 |
| - } else { |
789 |
| - for (size_t frag_idx = 0; frag_idx < frag_count; ++frag_idx) { |
790 |
| - auto& frag = fragments[frag_idx]; |
791 |
| - frag.offset = |
792 |
| - frag_idx ? ((frag_idx - 1) * table.fragment_size + first_frag_size) : 0; |
793 |
| - frag.row_count = |
794 |
| - frag_idx ? std::min(table.fragment_size, |
795 |
| - static_cast<size_t>(at->num_rows()) - frag.offset) |
796 |
| - : first_frag_size; |
797 |
| - CHECK(col_type->isText()); |
798 |
| - auto meta = std::make_shared<ChunkMetadata>( |
799 |
| - col_info->type, |
800 |
| - computeTotalStringsLength(col_arr, frag.offset, frag.row_count), |
801 |
| - frag.row_count); |
802 |
| - meta->fillStringChunkStats( |
803 |
| - col_arr->Slice(frag.offset, frag.row_count)->null_count()); |
804 |
| - |
805 |
| - frag.metadata[col_idx] = meta; |
806 |
| - } |
807 |
| - } |
| 739 | + bool compute_stats = !col_type->isString(); |
| 740 | + if (compute_stats) { |
| 741 | + size_t elems_count = 1; |
| 742 | + if (col_type->isFixedLenArray()) { |
| 743 | + elems_count = col_type->size() / elem_type->size(); |
808 | 744 | }
|
809 |
| - }); // each column |
| 745 | + // Compute stats for each fragment. |
| 746 | + tbb::parallel_for( |
| 747 | + tbb::blocked_range(size_t(0), frag_count), [&](auto frag_range) { |
| 748 | + for (size_t frag_idx = frag_range.begin(); frag_idx != frag_range.end(); |
| 749 | + ++frag_idx) { |
| 750 | + auto& frag = fragments[frag_idx]; |
| 751 | + |
| 752 | + frag.offset = |
| 753 | + frag_idx ? ((frag_idx - 1) * table.fragment_size + first_frag_size) |
| 754 | + : 0; |
| 755 | + frag.row_count = |
| 756 | + frag_idx ? std::min(table.fragment_size, |
| 757 | + static_cast<size_t>(at->num_rows()) - frag.offset) |
| 758 | + : first_frag_size; |
| 759 | + |
| 760 | + size_t num_bytes; |
| 761 | + if (col_type->isFixedLenArray()) { |
| 762 | + num_bytes = frag.row_count * col_type->size(); |
| 763 | + } else if (col_type->isVarLenArray()) { |
| 764 | + num_bytes = |
| 765 | + computeTotalStringsLength(col_arr, frag.offset, frag.row_count); |
| 766 | + } else { |
| 767 | + num_bytes = frag.row_count * col_type->size(); |
| 768 | + } |
| 769 | + auto meta = std::make_shared<ChunkMetadata>( |
| 770 | + col_info->type, num_bytes, frag.row_count); |
| 771 | + |
| 772 | + if (!lazy_fetch_cols[col_idx]) { |
| 773 | + meta->fillChunkStats(computeStats( |
| 774 | + col_arr->Slice(frag.offset, frag.row_count * elems_count), |
| 775 | + col_type)); |
| 776 | + } else { |
| 777 | + int32_t min = 0; |
| 778 | + int32_t max = -1; |
| 779 | + meta->fillChunkStats(min, max, /*has_nulls=*/true); |
| 780 | + } |
| 781 | + frag.metadata[col_idx] = meta; |
| 782 | + } |
| 783 | + }); // each fragment |
| 784 | + } else { |
| 785 | + for (size_t frag_idx = 0; frag_idx < frag_count; ++frag_idx) { |
| 786 | + auto& frag = fragments[frag_idx]; |
| 787 | + frag.offset = |
| 788 | + frag_idx ? ((frag_idx - 1) * table.fragment_size + first_frag_size) : 0; |
| 789 | + frag.row_count = |
| 790 | + frag_idx ? std::min(table.fragment_size, |
| 791 | + static_cast<size_t>(at->num_rows()) - frag.offset) |
| 792 | + : first_frag_size; |
| 793 | + CHECK(col_type->isText()); |
| 794 | + auto meta = std::make_shared<ChunkMetadata>( |
| 795 | + col_info->type, |
| 796 | + computeTotalStringsLength(col_arr, frag.offset, frag.row_count), |
| 797 | + frag.row_count); |
| 798 | + meta->fillStringChunkStats( |
| 799 | + col_arr->Slice(frag.offset, frag.row_count)->null_count()); |
| 800 | + |
| 801 | + frag.metadata[col_idx] = meta; |
| 802 | + } |
| 803 | + } |
| 804 | + } |
| 805 | + }); // each column |
810 | 806 | dict_lock.unlock();
|
811 | 807 |
|
812 | 808 | if (table.row_count) {
|
|
0 commit comments