Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit e5948c0

Browse files
committed
Add StringDictionary::buildUnionTranslationMap.
Signed-off-by: ienkovich <[email protected]>
1 parent d642c1c commit e5948c0

7 files changed

+58
-28
lines changed

omniscidb/ResultSet/RowSetMemoryOwner.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -218,8 +218,7 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable {
218218
auto it = str_proxy_union_translation_maps_owned_.find(map_key);
219219
if (it == str_proxy_union_translation_maps_owned_.end()) {
220220
it = str_proxy_union_translation_maps_owned_
221-
.emplace(map_key,
222-
source_proxy->buildUnionTranslationMapToOtherProxy(dest_proxy))
221+
.emplace(map_key, source_proxy->buildUnionTranslationMap(dest_proxy))
223222
.first;
224223
}
225224
return &it->second;

omniscidb/StringDictionary/StringDictionary.cpp

+34-2
Original file line numberDiff line numberDiff line change
@@ -1438,6 +1438,37 @@ std::vector<int32_t> StringDictionary::buildIntersectionTranslationMap(
14381438
return translated_ids;
14391439
}
14401440

1441+
// Builds a map from each string id of this dictionary to a string id in
// `dest`. A bulk pass (StringDictionaryTranslator::buildDictionaryTranslationMap)
// fills the map first; every id it leaves as INVALID_STR_ID is then resolved
// by handing the source string to dest->getOrAdd() (presumably inserting the
// string into `dest` when it is absent, which is why `dest` is non-const).
//
// @param dest destination dictionary; dereferenced unconditionally, so it
//             must not be null.
// @return vector sized to this dictionary's entryCount() at call time and
//         indexed by source string id.
std::vector<int32_t> StringDictionary::buildUnionTranslationMap(
1442+
StringDictionary* dest) const {
1443+
// Callback that ignores its arguments and always returns true.
auto dummy_callback = [](const std::string_view& source_string,
1444+
const int32_t source_string_id) { return true; };
1445+
const size_t num_source_strings = entryCount();
1446+
const size_t num_dest_strings = dest->entryCount();
1447+
std::vector<int32_t> translated_ids(num_source_strings);
1448+
// Bulk translation pass writing directly into translated_ids' storage.
// NOTE(review): the meaning of the `false` flag is defined by
// buildDictionaryTranslationMap and is not visible here - confirm.
auto num_untranslated_strings =
1449+
StringDictionaryTranslator::buildDictionaryTranslationMap(this,
1450+
dest,
1451+
translated_ids.data(),
1452+
num_source_strings,
1453+
num_dest_strings,
1454+
false,
1455+
dummy_callback);
1456+
// Skip the per-id fix-up scan entirely when the bulk pass reported no
// untranslated strings.
if (num_untranslated_strings > 0) {
1457+
// Todo (todd): Add call to fetch string_views (local) or strings (distributed)
1458+
// for all non-translated ids to avoid string-by-string fetch
1459+
for (int32_t source_string_id = 0;
1460+
source_string_id < static_cast<int32_t>(translated_ids.size());
1461+
++source_string_id) {
1462+
if (translated_ids[source_string_id] == StringDictionary::INVALID_STR_ID) {
1463+
// Fetch the source string and store whatever id dest->getOrAdd()
// returns for it, replacing the INVALID_STR_ID placeholder.
const auto source_string = getStringUnlocked(source_string_id);
1464+
const auto dest_string_id = dest->getOrAdd(source_string);
1465+
translated_ids[source_string_id] = dest_string_id;
1466+
}
1467+
}
1468+
}
1469+
return translated_ids;
1470+
}
1471+
14411472
} // namespace legacy
14421473

14431474
std::vector<int32_t> StringDictionaryTranslator::buildDictionaryTranslationMap(
@@ -1503,8 +1534,9 @@ size_t StringDictionaryTranslator::buildDictionaryTranslationMap(
15031534
return num_source_strings;
15041535
}
15051536

1537+
size_t base_num_strings_not_translated = 0;
15061538
if (source_dict->base_dict_) {
1507-
buildDictionaryTranslationMap(
1539+
base_num_strings_not_translated = buildDictionaryTranslationMap(
15081540
source_dict->base_dict_.get(),
15091541
dest_dict,
15101542
translated_ids,
@@ -1526,7 +1558,7 @@ size_t StringDictionaryTranslator::buildDictionaryTranslationMap(
15261558
tbb::blocked_range<int32_t>(source_dict->base_generation_,
15271559
num_source_strings,
15281560
target_strings_per_task /* tbb grain_size */),
1529-
(size_t)0,
1561+
base_num_strings_not_translated,
15301562
[&](const tbb::blocked_range<int32_t>& r, size_t num_strings_not_translated) {
15311563
const int32_t start_idx = r.begin();
15321564
const int32_t end_idx = r.end();

omniscidb/StringDictionary/StringDictionary.h

+1
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ class StringDictionary {
140140

141141
std::vector<int32_t> buildIntersectionTranslationMap(
142142
const StringDictionary* dest) const;
143+
std::vector<int32_t> buildUnionTranslationMap(StringDictionary* dest) const;
143144

144145
static constexpr int32_t INVALID_STR_ID = -1;
145146
static constexpr size_t MAX_STRLEN = (1 << 15) - 1;

omniscidb/StringDictionary/StringDictionaryProxy.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ std::vector<int32_t> StringDictionaryProxy::buildIntersectionTranslationMap(
292292
num_strings_not_translated);
293293
}
294294

295-
std::vector<int32_t> StringDictionaryProxy::buildUnionTranslationMapToOtherProxy(
295+
std::vector<int32_t> StringDictionaryProxy::buildUnionTranslationMap(
296296
StringDictionaryProxy* dest_proxy) const {
297297
auto timer = DEBUG_TIMER(__func__);
298298

omniscidb/StringDictionary/StringDictionaryProxy.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,7 @@ class StringDictionaryProxy {
9090
std::vector<int32_t> buildIntersectionTranslationMap(
9191
const StringDictionaryProxy* dest_proxy) const;
9292

93-
std::vector<int32_t> buildUnionTranslationMapToOtherProxy(
94-
StringDictionaryProxy* dest_proxy) const;
93+
std::vector<int32_t> buildUnionTranslationMap(StringDictionaryProxy* dest_proxy) const;
9594

9695
/**
9796
* @brief Returns the number of transient string entries for this proxy,

omniscidb/Tests/StringDictionaryBenchmark.cpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -385,9 +385,7 @@ BENCHMARK_DEFINE_F(
385385
2, true, append_strings_10M_10M_10_randomized_truncated_8M);
386386
dest_proxy->getOrAddBulk(append_strings_10M_10M_10_randomized_truncated_100);
387387
for (auto _ : state) {
388-
auto id_map = source_proxy->buildUnionTranslationMapToOtherProxy(dest_proxy.get());
389-
const size_t num_expected_untranslated_strings =
390-
num_elems - first_n_elems - last_n_elems;
388+
auto id_map = source_proxy->buildUnionTranslationMap(dest_proxy.get());
391389
}
392390
}
393391

omniscidb/Tests/StringDictionaryTest.cpp

+19-18
Original file line numberDiff line numberDiff line change
@@ -1100,7 +1100,7 @@ TEST(NestedStringDictionary, BuildIntersectionTranslationMap) {
11001100
}
11011101
}
11021102

1103-
TEST(StringDictionaryProxy, BuildUnionTranslationMapToEmptyProxy) {
1103+
TEST(NestedStringDictionary, BuildUnionTranslationMap_Empty) {
11041104
// Todo(todd): Migrate this and intersection translation tests to use
11051105
// approach and methods in BuildUnionTranslationMap_PartialOverlap
11061106
const DictRef dict_ref1(-1, 1);
@@ -1123,15 +1123,14 @@ TEST(StringDictionaryProxy, BuildUnionTranslationMapToEmptyProxy) {
11231123
// First try to union translate to empty dictionary.
11241124
// All strings should end up as transient entries in
11251125
// destination proxy
1126-
std::shared_ptr<StringDictionaryProxy> source_string_dict_proxy =
1127-
std::make_shared<StringDictionaryProxy>(source_string_dict,
1128-
source_string_dict->storageEntryCount());
1129-
std::shared_ptr<StringDictionaryProxy> dest_string_dict_proxy =
1130-
std::make_shared<StringDictionaryProxy>(dest_string_dict,
1131-
dest_string_dict->storageEntryCount());
1126+
std::shared_ptr<StringDictionary> source_string_dict_proxy =
1127+
std::make_shared<StringDictionary>(source_string_dict,
1128+
source_string_dict->storageEntryCount());
1129+
std::shared_ptr<StringDictionary> dest_string_dict_proxy =
1130+
std::make_shared<StringDictionary>(dest_string_dict,
1131+
dest_string_dict->storageEntryCount());
11321132
const auto str_proxy_translation_map =
1133-
source_string_dict_proxy->buildUnionTranslationMapToOtherProxy(
1134-
dest_string_dict_proxy.get());
1133+
source_string_dict_proxy->buildUnionTranslationMap(dest_string_dict_proxy.get());
11351134
ASSERT_FALSE(str_proxy_translation_map.empty());
11361135
const size_t num_ids = str_proxy_translation_map.size();
11371136
ASSERT_EQ(num_ids, source_string_dict_proxy->entryCount());
@@ -1161,7 +1160,7 @@ std::vector<std::string> add_strings_numeric_range(std::shared_ptr<StringDiction
11611160
return strings;
11621161
}
11631162

1164-
std::vector<std::string> add_strings_numeric_range(StringDictionaryProxy& sdp,
1163+
std::vector<std::string> add_strings_numeric_range(StringDictionary& sdp,
11651164
const size_t num_vals,
11661165
const int32_t start_val) {
11671166
CHECK_GE(start_val, sdp.getBaseGeneration());
@@ -1174,8 +1173,8 @@ std::vector<std::string> add_strings_numeric_range(StringDictionaryProxy& sdp,
11741173
return strings;
11751174
}
11761175

1177-
void verify_translation(const StringDictionaryProxy& source_proxy,
1178-
const StringDictionaryProxy& dest_proxy,
1176+
void verify_translation(const StringDictionary& source_proxy,
1177+
const StringDictionary& dest_proxy,
11791178
const std::vector<int32_t>& id_map,
11801179
const std::vector<std::string>& persisted_source_strings,
11811180
const std::vector<std::string>& transient_source_strings,
@@ -1211,7 +1210,7 @@ void verify_translation(const StringDictionaryProxy& source_proxy,
12111210
}
12121211
}
12131212

1214-
TEST(StringDictionaryProxy, BuildUnionTranslationMapToPartialOverlapProxy) {
1213+
TEST(NestedStringDictionary, BuildUnionTranslationMap_PartialOverlap) {
12151214
const DictRef dict_ref1(-1, 1);
12161215
const DictRef dict_ref2(-1, 2);
12171216
std::shared_ptr<StringDictionary> source_sd =
@@ -1236,23 +1235,25 @@ TEST(StringDictionaryProxy, BuildUnionTranslationMapToPartialOverlapProxy) {
12361235
dest_sd, num_dest_persisted_entries, dest_persisted_start_val);
12371236
ASSERT_EQ(dest_sd->storageEntryCount(), num_dest_persisted_entries);
12381237

1239-
StringDictionaryProxy source_sdp(source_sd, source_sd->storageEntryCount());
1240-
StringDictionaryProxy dest_sdp(dest_sd, dest_sd->storageEntryCount());
1238+
StringDictionary source_sdp(source_sd, source_sd->storageEntryCount());
1239+
StringDictionary dest_sdp(dest_sd, dest_sd->storageEntryCount());
12411240
const auto transient_source_strings = add_strings_numeric_range(
12421241
source_sdp, num_source_transient_entries, source_transient_start_val);
12431242
ASSERT_EQ(source_sdp.getBaseDictionary()->getDictId(), 1);
12441243
ASSERT_EQ(source_sdp.getBaseDictionary()->storageEntryCount(),
12451244
num_source_persisted_entries);
1246-
ASSERT_EQ(source_sdp.transientEntryCount(), num_source_transient_entries);
1245+
ASSERT_EQ(source_sdp.entryCount() - source_sdp.getBaseGeneration(),
1246+
num_source_transient_entries);
12471247

12481248
const auto transient_dest_strings = add_strings_numeric_range(
12491249
dest_sdp, num_dest_transient_entries, dest_transient_start_val);
12501250
ASSERT_EQ(dest_sdp.getBaseDictionary()->getDictId(), 2);
12511251
ASSERT_EQ(dest_sdp.getBaseDictionary()->storageEntryCount(),
12521252
num_dest_persisted_entries);
1253-
ASSERT_EQ(dest_sdp.transientEntryCount(), num_dest_transient_entries);
1253+
ASSERT_EQ(dest_sdp.entryCount() - dest_sdp.getBaseGeneration(),
1254+
num_dest_transient_entries);
12541255

1255-
const auto id_map = source_sdp.buildUnionTranslationMapToOtherProxy(&dest_sdp);
1256+
const auto id_map = source_sdp.buildUnionTranslationMap(&dest_sdp);
12561257
ASSERT_EQ(id_map.size(), num_source_persisted_entries + num_source_transient_entries);
12571258

12581259
verify_translation(source_sdp,

0 commit comments

Comments
 (0)