Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions be/src/column/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,5 @@ add_library(Column STATIC
column_view/column_view_base.cpp
column_view/column_view_helper.cpp
variant_column.cpp
german_string.cpp
)
40 changes: 28 additions & 12 deletions be/src/column/binary_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ template <typename T>
void BinaryColumnBase<T>::append(const Slice& str) {
_bytes.insert(_bytes.end(), str.data, str.data + str.size);
_offsets.emplace_back(_bytes.size());
_slices_cache = false;
invalidate_slice_cache();
}

template <typename T>
Expand Down Expand Up @@ -79,7 +79,7 @@ void BinaryColumnBase<T>::append(const Column& src, size_t offset, size_t count)
new_offsets[i] += delta;
}

_slices_cache = false;
invalidate_slice_cache();
}

template <typename T>
Expand Down Expand Up @@ -146,7 +146,7 @@ void BinaryColumnBase<T>::append_selective(const Column& src, const uint32_t* in
}
_offsets.resize(prev_num_offsets + size);

_slices_cache = false;
invalidate_slice_cache();
}

template <typename T>
Expand Down Expand Up @@ -175,7 +175,7 @@ void BinaryColumnBase<T>::append_value_multiple_times(const Column& src, uint32_
str_size);
}

_slices_cache = false;
invalidate_slice_cache();
}

//TODO(fzh): optimize copy using SIMD
Expand Down Expand Up @@ -229,7 +229,7 @@ bool BinaryColumnBase<T>::append_strings(const Slice* data, size_t size) {
strings::memcpy_inlined(bytes + offsets[i], p, data[i].size);
}

_slices_cache = false;
invalidate_slice_cache();
return true;
}

Expand Down Expand Up @@ -284,7 +284,7 @@ bool BinaryColumnBase<T>::append_strings_overflow(const Slice* data, size_t size
_offsets.emplace_back(_bytes.size());
}
}
_slices_cache = false;
invalidate_slice_cache();
return true;
}

Expand All @@ -305,7 +305,7 @@ bool BinaryColumnBase<T>::append_continuous_strings(const Slice* data, size_t si
_offsets.emplace_back(new_size);
}
DCHECK_EQ(_bytes.size(), new_size);
_slices_cache = false;
invalidate_slice_cache();
return true;
}

Expand Down Expand Up @@ -359,7 +359,7 @@ void BinaryColumnBase<T>::append_bytes(char* const* data, uint32_t* length, size
for (size_t i = 0; i < size; i++) {
_bytes.insert(_bytes.end(), data[i], data[i] + length[i]);
}
_slices_cache = false;
invalidate_slice_cache();
}

template <typename T, size_t copy_length>
Expand Down Expand Up @@ -399,7 +399,7 @@ void BinaryColumnBase<T>::append_bytes_overflow(char* const* data, uint32_t* len
} else {
append_bytes(data, lengths, size);
}
_slices_cache = false;
invalidate_slice_cache();
}

template <typename T>
Expand All @@ -414,7 +414,7 @@ void BinaryColumnBase<T>::append_value_multiple_times(const void* value, size_t
_bytes.insert(_bytes.end(), p, pend);
_offsets.emplace_back(_bytes.size());
}
_slices_cache = false;
invalidate_slice_cache();
}

template <typename T>
Expand All @@ -436,6 +436,22 @@ void BinaryColumnBase<T>::_build_slices() const {
_slices_cache = true;
}

// Rebuilds the cached GermanString representation of every row from
// _bytes and _offsets, then marks the cache as valid.
template <typename T>
void BinaryColumnBase<T>::_build_german_strings() const {
    DCHECK(_offsets.size() > 0);
    // Keep the cache flagged as stale while it is being rebuilt.
    _german_strings_cache = false;
    _german_strings.clear();

    // Row i spans [_offsets[i], _offsets[i + 1]), so there is one fewer row than offsets.
    const auto num_rows = _offsets.size() - 1;
    _german_strings.resize(num_rows);

    const auto* base = _bytes.data();
    // The index must be size_t: `auto i = 0` deduces int, which mixes signed and
    // unsigned in the comparison and overflows once the column exceeds INT_MAX rows.
    for (size_t i = 0; i < num_rows; ++i) {
        _german_strings[i] = GermanString(base + _offsets[i], _offsets[i + 1] - _offsets[i]);
    }
    _german_strings_cache = true;
}

template <typename T>
void BinaryColumnBase<T>::fill_default(const Filter& filter) {
std::vector<uint32_t> indexes;
Expand Down Expand Up @@ -506,7 +522,7 @@ void BinaryColumnBase<T>::assign(size_t n, size_t idx) {
_bytes.insert(_bytes.end(), start, end);
_offsets.emplace_back(_bytes.size());
}
_slices_cache = false;
invalidate_slice_cache();
}

//TODO(kks): improve this
Expand All @@ -519,7 +535,7 @@ void BinaryColumnBase<T>::remove_first_n_values(size_t count) {
auto* binary_column = down_cast<const BinaryColumnBase<T>*>(column.get());
_offsets = std::move(binary_column->_offsets);
_bytes = std::move(binary_column->_bytes);
_slices_cache = false;
invalidate_slice_cache();
}

template <typename T>
Expand Down
25 changes: 24 additions & 1 deletion be/src/column/binary_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "column/bytes.h"
#include "column/column.h"
#include "column/datum.h"
#include "column/german_string.h"
#include "column/vectorized_fwd.h"
#include "common/statusor.h"
#include "gutil/strings/fastmem.h"
Expand Down Expand Up @@ -48,6 +49,7 @@ class BinaryColumnBase final : public CowFactory<ColumnFactory<Column, BinaryCol
};

using Container = Buffer<Slice>;
using GermanStringContainer = Buffer<GermanString>;
using ProxyContainer = BinaryDataProxyContainer;
using ImmContainer = BinaryDataProxyContainer;

Expand Down Expand Up @@ -304,6 +306,20 @@ class BinaryColumnBase final : public CowFactory<ColumnFactory<Column, BinaryCol
return _slices;
}

// Returns the cached GermanString container, building it first when the
// cache flag says it is not valid.
GermanStringContainer& get_german_strings() {
    if (_german_strings_cache) {
        return _german_strings;
    }
    _build_german_strings();
    return _german_strings;
}

// Const overload: returns the cached GermanString container, building it
// first when the cache flag says it is not valid (the cache members are mutable).
const GermanStringContainer& get_german_strings() const {
    if (_german_strings_cache) {
        return _german_strings;
    }
    _build_german_strings();
    return _german_strings;
}

const BinaryDataProxyContainer& get_proxy_data() const { return _immuable_container; }

Bytes& get_bytes() { return _bytes; }
Expand Down Expand Up @@ -342,7 +358,10 @@ class BinaryColumnBase final : public CowFactory<ColumnFactory<Column, BinaryCol
_slices_cache = false;
}

void invalidate_slice_cache() { _slices_cache = false; }
// Marks both derived caches as stale: despite the name, this resets the
// GermanString cache flag as well as the Slice cache flag.
void invalidate_slice_cache() {
    _german_strings_cache = false;
    _slices_cache = false;
}

std::string debug_item(size_t idx) const override;

Expand All @@ -366,12 +385,16 @@ class BinaryColumnBase final : public CowFactory<ColumnFactory<Column, BinaryCol

private:
void _build_slices() const;
void _build_german_strings() const;

Bytes _bytes;
Offsets _offsets;

mutable Container _slices;
mutable bool _slices_cache = false;
mutable GermanStringContainer _german_strings;
mutable bool _german_strings_cache = false;

BinaryDataProxyContainer _immuable_container = BinaryDataProxyContainer(*this);
};

Expand Down
103 changes: 103 additions & 0 deletions be/src/column/german_string.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "column/german_string.h"

#include "gutil/strings/fastmem.h"
#include "util/hash_util.hpp"
#include "util/misc.h"
#include "util/raw_container.h"
#include "util/slice.h"

namespace starrocks {

// Default constructor: sets every byte of the object to zero.
GermanString::GermanString() {
    auto* const first = reinterpret_cast<char*>(this);
    auto* const last = first + sizeof(GermanString);
    std::fill(first, last, 0);
}

// Copy constructor: duplicates rhs with a raw byte-wise copy of the whole object.
// For a non-inline rhs the stored pointer value is copied as-is; the bytes it
// refers to are not duplicated here.
GermanString::GermanString(const starrocks::GermanString& rhs) {
    constexpr size_t kObjectSize = sizeof(GermanString);
    strings::memcpy_inlined(this, &rhs, kObjectSize);
}

// Builds a GermanString from `len` bytes at `str`.
//  - len <= INLINE_MAX_LENGTH: the object is zeroed and the bytes are stored
//    inline in short_rep; `ptr` is not touched.
//  - otherwise: the first PREFIX_LENGTH bytes go into long_rep.prefix, all
//    `len` bytes are copied into the caller-supplied buffer `ptr`, and `ptr`
//    is recorded as the payload address.
GermanString::GermanString(const char* str, size_t len, void* ptr) {
    const bool needs_external_storage = len > INLINE_MAX_LENGTH;
    if (needs_external_storage) {
        strings::memcpy_inlined(long_rep.prefix, str, PREFIX_LENGTH);
        strings::memcpy_inlined(ptr, str, len);
        long_rep.ptr = reinterpret_cast<uintptr_t>(ptr);
    } else {
        auto* self_bytes = reinterpret_cast<char*>(this);
        std::fill(self_bytes, self_bytes + sizeof(GermanString), 0);
        strings::memcpy_inlined(short_rep.str, str, len);
    }
    // The length is written last because the inline path zeroes the whole object first.
    this->len = len;
}

// Builds a GermanString that refers to `len` bytes at `str`.
//  - len <= INLINE_MAX_LENGTH: the object is zeroed and the bytes are copied
//    inline into short_rep.
//  - otherwise: only the first PREFIX_LENGTH bytes are copied (into
//    long_rep.prefix) and the address `str` itself is stored; the payload is
//    not copied by this constructor.
GermanString::GermanString(const void* str, size_t len) {
    const bool needs_external_storage = len > INLINE_MAX_LENGTH;
    if (needs_external_storage) {
        strings::memcpy_inlined(long_rep.prefix, str, PREFIX_LENGTH);
        long_rep.ptr = reinterpret_cast<uintptr_t>(str);
    } else {
        auto* self_bytes = reinterpret_cast<char*>(this);
        std::fill(self_bytes, self_bytes + sizeof(GermanString), 0);
        strings::memcpy_inlined(short_rep.str, str, len);
    }
    // The length is written last because the inline path zeroes the whole object first.
    this->len = len;
}
// Assigns from a Slice by constructing a temporary GermanString from it and
// assigning that temporary to *this.
GermanString& GermanString::operator=(const Slice& slice) {
    return (*this = GermanString(slice));
}

// Copies rhs, relocating its payload into the caller-supplied buffer `ptr`
// when rhs is not stored inline. Inline values are copied byte-wise and `ptr`
// is left untouched.
GermanString::GermanString(const GermanString& rhs, void* ptr) {
    strings::memcpy_inlined(this, &rhs, sizeof(GermanString));
    if (rhs.is_inline()) {
        return;
    }
    // NOLINTNEXTLINE(performance-no-int-to-ptr)
    const auto* src_payload = reinterpret_cast<const char*>(rhs.long_rep.ptr);
    strings::memcpy_inlined(ptr, src_payload, rhs.len);
    // Point this object at its own copy of the payload rather than at rhs's.
    long_rep.ptr = reinterpret_cast<uintptr_t>(ptr);
}

// Converts to std::string: inline values are built directly from short_rep,
// longer values are copied out of the address stored in long_rep.ptr.
GermanString::operator std::string() const {
    if (len <= INLINE_MAX_LENGTH) {
        return std::string(short_rep.str, len);
    }
    std::string result;
    // Make room for `len` bytes before copying the payload in.
    raw::make_room(&result, len);
    // NOLINTNEXTLINE(performance-no-int-to-ptr)
    const auto* payload = reinterpret_cast<const char*>(long_rep.ptr);
    strings::memcpy_inlined(result.data(), payload, len);
    return result;
}

// Returns HashUtil::fnv_hash over the string's `len` bytes with the given
// seed, reading from the inline buffer or from the address in long_rep.ptr.
uint32_t GermanString::fnv_hash(uint32_t seed) const {
    if (!is_inline()) {
        // NOLINTNEXTLINE(performance-no-int-to-ptr)
        const auto* payload = reinterpret_cast<const char*>(long_rep.ptr);
        return HashUtil::fnv_hash(payload, len, seed);
    }
    return HashUtil::fnv_hash(short_rep.str, len, seed);
}
// Returns HashUtil::zlib_crc_hash over the string's `len` bytes with the given
// seed, reading from the inline buffer or from the address in long_rep.ptr.
uint32_t GermanString::crc32_hash(uint32_t seed) const {
    if (!is_inline()) {
        // NOLINTNEXTLINE(performance-no-int-to-ptr)
        const auto* payload = reinterpret_cast<const char*>(long_rep.ptr);
        return HashUtil::zlib_crc_hash(payload, len, seed);
    }
    return HashUtil::zlib_crc_hash(short_rep.str, len, seed);
}

} // namespace starrocks
Loading
Loading