Remove memory-allocation optimizations in FunctionTokens

This commit is contained in:
taiyang-li 2024-04-16 10:29:25 +08:00
parent 34e07f6596
commit 8c0d2cc0fc
12 changed files with 25 additions and 74 deletions

View File

@ -58,7 +58,6 @@ private:
public:
static constexpr auto name = Generator::name;
/// Factory: builds a FunctionTokens from the given context with std::make_shared
/// and returns it as a FunctionPtr.
static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionTokens>(context); }
explicit FunctionTokens<Generator>(ContextPtr context)
@ -107,12 +106,8 @@ public:
const ColumnString::Chars & src_chars = col_str->getChars();
const ColumnString::Offsets & src_offsets = col_str->getOffsets();
res_offsets.resize_exact(src_offsets.size());
res_offsets.reserve(src_offsets.size());
res_strings_offsets.reserve(src_offsets.size() * 5); /// Constant 5 - at random.
std::optional<size_t> res_chars_reserve_size = generator.getResultReserveSize();
if (res_chars_reserve_size.has_value())
res_strings_chars.reserve_exact(*res_chars_reserve_size);
else
res_strings_chars.reserve(src_chars.size());
Pos token_begin = nullptr;
@ -122,40 +117,32 @@ public:
ColumnString::Offset current_src_offset = 0;
ColumnArray::Offset current_dst_offset = 0;
ColumnString::Offset current_dst_strings_offset = 0;
for (size_t i = 0; i < size; ++i)
{
Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]);
current_src_offset = src_offsets[i];
Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1;
#define PROCESS_WITH_CUSTOM_RESIZE(RESIZE_METHOD) \
for (size_t i = 0; i < size; ++i) \
{ \
Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]); \
current_src_offset = src_offsets[i]; \
Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1; \
generator.set(pos, end); \
size_t j = 0; \
while (generator.get(token_begin, token_end)) \
{ \
size_t token_size = token_end - token_begin; \
res_strings_chars.RESIZE_METHOD(res_strings_chars.size() + token_size + 1); \
memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size); \
res_strings_chars[current_dst_strings_offset + token_size] = 0; \
current_dst_strings_offset += token_size + 1; \
res_strings_offsets.push_back(current_dst_strings_offset); \
++j; \
} \
current_dst_offset += j; \
res_offsets[i] = current_dst_offset; \
generator.set(pos, end);
size_t j = 0;
while (generator.get(token_begin, token_end))
{
size_t token_size = token_end - token_begin;
res_strings_chars.resize(res_strings_chars.size() + token_size + 1);
memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size);
res_strings_chars[current_dst_strings_offset + token_size] = 0;
current_dst_strings_offset += token_size + 1;
res_strings_offsets.push_back(current_dst_strings_offset);
++j;
}
if (res_chars_reserve_size.has_value())
{
/// If res_chars_reserve_size is not -1, then we are sure that the actual size of res_strings_chars doesn't exceed res_chars_reserve_size.
PROCESS_WITH_CUSTOM_RESIZE(resize_assume_reserved)
}
else
{
PROCESS_WITH_CUSTOM_RESIZE(resize)
current_dst_offset += j;
res_offsets.push_back(current_dst_offset);
}
return col_res;
#undef PROCESS_WITH_CUSTOM_RESIZE
}
else if (col_str_const)
{

View File

@ -39,8 +39,6 @@ public:
/// No-op initialization: both parameters are left unnamed and the body is empty.
void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{

View File

@ -37,8 +37,6 @@ public:
/// No-op initialization: both parameters are left unnamed and the body is empty.
void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substring_behavior*/) {}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{

View File

@ -37,8 +37,6 @@ public:
/// No-op initialization: both parameters are left unnamed and the body is empty.
void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{

View File

@ -36,8 +36,6 @@ public:
/// No-op initialization: both parameters are left unnamed and the body is empty.
void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
static constexpr auto strings_argument_position = 0uz;
/// Called for each next string.

View File

@ -47,8 +47,6 @@ public:
max_splits = extractMaxSplits(arguments, 1);
}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{

View File

@ -78,8 +78,6 @@ public:
matches.resize(capture + 1);
}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{

View File

@ -34,7 +34,6 @@ private:
std::optional<size_t> max_splits;
size_t splits;
bool max_substrings_includes_remaining_string;
std::optional<size_t> result_reserve_size;
public:
static constexpr auto name = "splitByChar";
@ -67,21 +66,6 @@ public:
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
max_splits = extractMaxSplits(arguments, 2);
const ColumnString * col_str = checkAndGetColumn<ColumnString>(arguments[strings_argument_position].column.get());
/// There is another possibility that the input column is ColumnConst. We ignore it because there is no need to get reserve size under such condition.
if (col_str)
{
const ColumnString::Chars & src_chars = col_str->getChars();
/// Consider use case: splitByChar(' ', 'a b c'), where input chars is "a b c\0", output chars is "a\0", "b\0", "c\0".
/// The size of output chars should never exceeds input chars
result_reserve_size = src_chars.size();
}
}
/// Returns the stored `result_reserve_size` member unchanged.
/// The visible part of init() assigns it `src_chars.size()` only when the strings
/// argument is a ColumnString; otherwise it is not assigned there.
std::optional<size_t> getResultReserveSize() const
{
return result_reserve_size;
}
void set(Pos pos_, Pos end_)
@ -93,7 +77,7 @@ public:
bool get(Pos & token_begin, Pos & token_end)
{
if (!pos) [[unlikely]]
if (!pos)
return false;
token_begin = pos;

View File

@ -57,8 +57,6 @@ public:
max_splits = extractMaxSplits(arguments, 1);
}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{

View File

@ -70,8 +70,6 @@ public:
max_splits = extractMaxSplits(arguments, 2);
}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{

View File

@ -62,8 +62,6 @@ public:
max_splits = extractMaxSplits(arguments, 2);
}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{

View File

@ -45,8 +45,6 @@ public:
max_splits = extractMaxSplits(arguments, 1);
}
/// Always returns an empty optional (std::nullopt), i.e. no reserve size is reported.
std::optional<size_t> getResultReserveSize() const { return std::nullopt; }
/// Called for each next string.
void set(Pos pos_, Pos end_)
{