@@ -750,83 +750,101 @@ std::vector<int32_t> StringDictionary::getLike(const std::string& pattern,
750
750
return result;
751
751
}
752
752
753
- std::vector<int32_t > StringDictionary::getEquals (std::string pattern,
754
- std::string comp_operator,
755
- size_t generation) {
756
- CHECK (!base_dict_) << " Not implemented " ;
753
+ std::vector<int32_t > StringDictionary::getEquals (const std::string& pattern,
754
+ const std::string& comp_operator,
755
+ int64_t generation) const {
756
+ mapd_lock_guard<mapd_shared_mutex> write_lock (rw_mutex_) ;
757
757
std::vector<int32_t > result;
758
+ if (base_dict_) {
759
+ result = base_dict_->getEquals (
760
+ pattern, comp_operator, std::min (generation, base_generation_));
761
+ if ((comp_operator == " =" && !result.empty ()) || generation < base_generation_) {
762
+ return result;
763
+ }
764
+ }
765
+
758
766
auto eq_id_itr = equal_cache_.find (pattern);
759
- int32_t eq_id = MAX_STRLEN + 1 ;
760
- int32_t cur_size = str_count_;
767
+ int32_t eq_id = -1 ;
761
768
if (eq_id_itr != equal_cache_.end ()) {
762
769
auto eq_id = eq_id_itr->second ;
763
770
if (comp_operator == " =" ) {
764
- result.push_back (eq_id);
771
+ if (eq_id < generation) {
772
+ result.push_back (eq_id);
773
+ }
765
774
} else {
766
- for (int32_t idx = 0 ; idx <= cur_size; idx ++) {
767
- if (idx = = eq_id) {
768
- continue ;
775
+ for (int32_t id = base_generation_; id < generation; id ++) {
776
+ if (id ! = eq_id) {
777
+ result. push_back (id) ;
769
778
}
770
- result.push_back (idx);
771
779
}
772
780
}
773
781
} else {
774
782
std::vector<std::thread> workers;
775
783
int worker_count = cpu_threads ();
776
784
CHECK_GT (worker_count, 0 );
777
- std::vector<std::vector<int32_t >> worker_results (worker_count);
778
- CHECK_LE (generation, str_count_);
779
785
for (int worker_idx = 0 ; worker_idx < worker_count; ++worker_idx) {
780
786
workers.emplace_back (
781
- [&worker_results , &pattern, generation, worker_idx, worker_count, this ]() {
782
- for (size_t string_id = worker_idx; string_id < generation;
787
+ [&eq_id , &pattern, generation, worker_idx, worker_count, this ]() {
788
+ for (int string_id = indexToId ( worker_idx) ; string_id < generation;
783
789
string_id += worker_count) {
784
790
const auto str = getStringUnlocked (string_id);
785
791
if (str == pattern) {
786
- worker_results[worker_idx].push_back (string_id);
792
+ // Only one thread can find matching string, so no additional sync.
793
+ eq_id = string_id;
794
+ break ;
787
795
}
788
796
}
789
797
});
790
798
}
791
799
for (auto & worker : workers) {
792
800
worker.join ();
793
801
}
794
- for (const auto & worker_result : worker_results) {
795
- result.insert (result.end (), worker_result.begin (), worker_result.end ());
796
- }
797
- if (result.size () > 0 ) {
798
- const auto it_ok = equal_cache_.insert (std::make_pair (pattern, result[0 ]));
802
+ if (eq_id >= 0 ) {
803
+ const auto it_ok = equal_cache_.insert (std::make_pair (pattern, eq_id));
799
804
CHECK (it_ok.second );
800
- eq_id = result[0 ];
801
805
}
802
806
if (comp_operator == " <>" ) {
803
- for (int32_t idx = 0 ; idx <= cur_size; idx ++) {
804
- if (idx = = eq_id) {
805
- continue ;
807
+ for (int32_t id = base_generation_; id < generation; id ++) {
808
+ if (id ! = eq_id) {
809
+ result. push_back (id) ;
806
810
}
807
- result.push_back (idx);
808
811
}
812
+ } else if (eq_id >= 0 && eq_id < generation) {
813
+ result.push_back (eq_id);
809
814
}
810
815
}
811
816
return result;
812
817
}
813
818
814
819
std::vector<int32_t > StringDictionary::getCompare (const std::string& pattern,
815
820
const std::string& comp_operator,
816
- const size_t generation) {
817
- CHECK (!base_dict_) << " Not implemented" ;
821
+ int64_t generation) const {
822
+ generation = generation >= 0 ? std::min (generation, static_cast <int64_t >(entryCount ()))
823
+ : static_cast <int64_t >(entryCount ());
824
+ {
825
+ // The lock is used only to check cache.
826
+ mapd_shared_lock<mapd_shared_mutex> read_lock (rw_mutex_);
827
+ if ((sorted_cache.size () < str_count_) &&
828
+ (comp_operator == " =" || comp_operator == " <>" )) {
829
+ read_lock.unlock ();
830
+ return getEquals (pattern, comp_operator, generation);
831
+ }
832
+ }
833
+
818
834
mapd_lock_guard<mapd_shared_mutex> write_lock (rw_mutex_);
819
835
std::vector<int32_t > ret;
820
- if (str_count_ == 0 ) {
821
- return ret;
822
- }
823
- if (sorted_cache.size () < str_count_) {
824
- if (comp_operator == " =" || comp_operator == " <>" ) {
825
- return getEquals (pattern, comp_operator, generation);
836
+ if (base_dict_) {
837
+ ret = base_dict_->getCompare (
838
+ pattern, comp_operator, std::min (generation, base_generation_));
839
+ if ((comp_operator == " =" && !ret.empty ()) || generation < base_generation_) {
840
+ return ret;
826
841
}
842
+ }
827
843
844
+ if (sorted_cache.size () < str_count_) {
828
845
buildSortedCache ();
829
846
}
847
+
830
848
auto cache_index = compare_cache_.get (pattern);
831
849
832
850
if (!cache_index) {
@@ -868,92 +886,72 @@ std::vector<int32_t> StringDictionary::getCompare(const std::string& pattern,
868
886
// For < operator if the index that we have points to the element which is equal to
869
887
// the pattern that we are searching for we simply get all the elements less than the
870
888
// index. If the element pointed by the index is not equal to the pattern we are
871
- // comparing with we also need to include that index in result vector, except when the
872
- // index points to 0 and the pattern is lesser than the smallest value in the string
873
- // dictionary.
889
+ // comparing with we also need to include that index in result vector.
874
890
875
891
if (comp_operator == " <" ) {
876
892
size_t idx = cache_index->index ;
877
893
if (cache_index->diff ) {
878
894
idx = cache_index->index + 1 ;
879
- if (cache_index->index == 0 && cache_index->diff > 0 ) {
880
- idx = cache_index->index ;
881
- }
882
895
}
883
896
for (size_t i = 0 ; i < idx; i++) {
884
- ret.push_back (sorted_cache[i]);
897
+ if (sorted_cache[i] < generation) {
898
+ ret.push_back (sorted_cache[i]);
899
+ }
885
900
}
886
901
887
- // For <= operator if the index that we have points to the element which is equal to
888
- // the pattern that we are searching for we want to include the element pointed by
889
- // the index in the result set. If the element pointed by the index is not equal to
890
- // the pattern we are comparing with we just want to include all the ids with index
891
- // less than the index that is cached, except when pattern that we are searching for
892
- // is smaller than the smallest string in the dictionary.
893
-
902
+ // For <= operator we want to include all the elements less than the index and
903
+ // the index itself since it cannot be greater than the pattern.
894
904
} else if (comp_operator == " <=" ) {
895
905
size_t idx = cache_index->index + 1 ;
896
- if (cache_index == 0 && cache_index->diff > 0 ) {
897
- idx = cache_index->index ;
898
- }
899
906
for (size_t i = 0 ; i < idx; i++) {
900
- ret.push_back (sorted_cache[i]);
907
+ if (sorted_cache[i] < generation) {
908
+ ret.push_back (sorted_cache[i]);
909
+ }
901
910
}
902
911
903
912
// For > operator we want to get all the elements with index greater than the index
904
- // that we have except, when the pattern we are searching for is lesser than the
905
- // smallest string in the dictionary we also want to include the id of the index
906
- // that we have.
907
-
908
913
} else if (comp_operator == " >" ) {
909
914
size_t idx = cache_index->index + 1 ;
910
- if (cache_index->index == 0 && cache_index->diff > 0 ) {
911
- idx = cache_index->index ;
912
- }
913
915
for (size_t i = idx; i < sorted_cache.size (); i++) {
914
- ret.push_back (sorted_cache[i]);
916
+ if (sorted_cache[i] < generation) {
917
+ ret.push_back (sorted_cache[i]);
918
+ }
915
919
}
916
920
917
- // For >= operator when the indexed element that we have points to element which is
918
- // equal to the pattern we are searching for we want to include that in the result
919
- // vector. If the index that we have does not point to the string which is equal to
920
- // the pattern we are searching we don't want to include that id into the result
921
- // vector except when the index is 0.
922
-
921
+ // For >= operator we want to get all the elements with index greater than the index.
922
+ // We also include the index if it matches the pattern
923
923
} else if (comp_operator == " >=" ) {
924
924
size_t idx = cache_index->index ;
925
925
if (cache_index->diff ) {
926
926
idx = cache_index->index + 1 ;
927
- if (cache_index->index == 0 && cache_index->diff > 0 ) {
928
- idx = cache_index->index ;
929
- }
930
927
}
931
928
for (size_t i = idx; i < sorted_cache.size (); i++) {
932
- ret.push_back (sorted_cache[i]);
929
+ if (sorted_cache[i] < generation) {
930
+ ret.push_back (sorted_cache[i]);
931
+ }
933
932
}
934
933
} else if (comp_operator == " =" ) {
935
934
if (!cache_index->diff ) {
936
- ret.push_back (sorted_cache[cache_index->index ]);
935
+ if (sorted_cache[cache_index->index ] < generation) {
936
+ ret.push_back (sorted_cache[cache_index->index ]);
937
+ }
937
938
}
938
939
939
940
// For <> operator it is a simple matter of not including id of string which is equal
940
941
// to pattern we are searching for.
941
942
} else if (comp_operator == " <>" ) {
942
943
if (!cache_index->diff ) {
943
- size_t idx = cache_index->index ;
944
- for (size_t i = 0 ; i < idx; i++) {
945
- ret.push_back (sorted_cache[i]);
946
- }
947
- ++idx;
948
- for (size_t i = idx; i < sorted_cache.size (); i++) {
949
- ret.push_back (sorted_cache[i]);
944
+ int eq_id = sorted_cache[cache_index->index ];
945
+ for (int id = base_generation_; id < generation; ++id) {
946
+ if (id != eq_id) {
947
+ ret.push_back (id);
948
+ }
950
949
}
951
950
} else {
952
- for (size_t i = 0 ; i < sorted_cache. size (); i++ ) {
953
- ret.insert (ret. begin (), sorted_cache. begin (), sorted_cache. end () );
951
+ for (int id = base_generation_; id < generation; ++id ) {
952
+ ret.push_back (id );
954
953
}
955
954
}
956
-
957
955
} else {
958
956
std::runtime_error (" Unsupported string comparison operator" );
959
957
}
@@ -1375,20 +1373,18 @@ void StringDictionary::invalidateInvertedIndex() noexcept {
1375
1373
compare_cache_.invalidateInvertedIndex ();
1376
1374
}
1377
1375
1378
- void StringDictionary::buildSortedCache () {
1379
- CHECK (!base_dict_) << " Not implemented" ;
1376
+ void StringDictionary::buildSortedCache () const {
1380
1377
// This method is not thread-safe.
1381
1378
const auto cur_cache_size = sorted_cache.size ();
1382
1379
std::vector<int32_t > temp_sorted_cache;
1383
1380
for (size_t i = cur_cache_size; i < str_count_; i++) {
1384
- temp_sorted_cache.push_back (i );
1381
+ temp_sorted_cache.push_back (indexToId (i) );
1385
1382
}
1386
1383
sortCache (temp_sorted_cache);
1387
1384
mergeSortedCache (temp_sorted_cache);
1388
1385
}
1389
1386
1390
- void StringDictionary::sortCache (std::vector<int32_t >& cache) {
1391
- CHECK (!base_dict_) << " Not implemented" ;
1387
+ void StringDictionary::sortCache (std::vector<int32_t >& cache) const {
1392
1388
// This method is not thread-safe.
1393
1389
1394
1390
// this boost sort is creating some problems when we use UTF-8 encoded strings.
@@ -1401,8 +1397,7 @@ void StringDictionary::sortCache(std::vector<int32_t>& cache) {
1401
1397
});
1402
1398
}
1403
1399
1404
- void StringDictionary::mergeSortedCache (std::vector<int32_t >& temp_sorted_cache) {
1405
- CHECK (!base_dict_) << " Not implemented" ;
1400
+ void StringDictionary::mergeSortedCache (std::vector<int32_t >& temp_sorted_cache) const {
1406
1401
// this method is not thread safe
1407
1402
std::vector<int32_t > updated_cache (temp_sorted_cache.size () + sorted_cache.size ());
1408
1403
size_t t_idx = 0 , s_idx = 0 , idx = 0 ;
0 commit comments