ClickHouse/src/Functions/ReplaceStringImpl.h
2023-04-05 12:07:22 +00:00

430 lines
16 KiB
C++

#pragma once
#include <base/types.h>
#include <Common/Volnitsky.h>
#include <Columns/ColumnString.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ARGUMENT_OUT_OF_BOUND;
}
struct ReplaceStringTraits
{
enum class Replace
{
First,
All
};
};
/** Replace one or all occurencies of substring 'needle' to 'replacement'.
*/
template <typename Name, ReplaceStringTraits::Replace replace>
struct ReplaceStringImpl
{
static constexpr auto name = Name::name;
static void vectorConstantConstant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const String & needle,
const String & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
const UInt8 * const begin = haystack_data.data();
const UInt8 * const end = haystack_data.data() + haystack_data.size();
const UInt8 * pos = begin;
ColumnString::Offset res_offset = 0;
res_data.reserve(haystack_data.size());
const size_t haystack_size = haystack_offsets.size();
res_offsets.resize(haystack_size);
/// The current index in the array of strings.
size_t i = 0;
Volnitsky searcher(needle.data(), needle.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);
/// Copy the data without changing
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
/// Determine which index it belongs to.
while (i < haystack_offsets.size() && begin + haystack_offsets[i] <= match)
{
res_offsets[i] = res_offset + ((begin + haystack_offsets[i]) - pos);
++i;
}
res_offset += (match - pos);
/// If you have reached the end, it's time to stop
if (i == haystack_offsets.size())
break;
/// Is it true that this string no longer needs to perform transformations.
bool can_finish_current_string = false;
/// We check that the entry does not go through the boundaries of strings.
if (match + needle.size() < begin + haystack_offsets[i])
{
res_data.resize(res_data.size() + replacement.size());
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
res_offset += replacement.size();
pos = match + needle.size();
if constexpr (replace == ReplaceStringTraits::Replace::First)
can_finish_current_string = true;
}
else
{
pos = match;
can_finish_current_string = true;
}
if (can_finish_current_string)
{
res_data.resize(res_data.size() + (begin + haystack_offsets[i] - pos));
memcpy(&res_data[res_offset], pos, (begin + haystack_offsets[i] - pos));
res_offset += (begin + haystack_offsets[i] - pos);
res_offsets[i] = res_offset;
pos = begin + haystack_offsets[i];
++i;
}
}
}
template <typename CharT>
requires (sizeof(CharT) == 1)
static void copyToOutput(
const CharT * what_start, size_t what_size,
ColumnString::Chars & output, ColumnString::Offset & output_offset)
{
output.resize(output.size() + what_size);
memcpy(&output[output_offset], what_start, what_size);
output_offset += what_size;
}
static void vectorVectorConstant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const ColumnString::Chars & needle_data,
const ColumnString::Offsets & needle_offsets,
const String & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
chassert(haystack_offsets.size() == needle_offsets.size());
const size_t haystack_size = haystack_offsets.size();
res_data.reserve(haystack_data.size());
res_offsets.resize(haystack_size);
ColumnString::Offset res_offset = 0;
size_t prev_haystack_offset = 0;
size_t prev_needle_offset = 0;
for (size_t i = 0; i < haystack_size; ++i)
{
const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
const auto * const cur_needle_data = &needle_data[prev_needle_offset];
const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1;
if (cur_needle_length == 0)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
/// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(cur_needle_data, cur_needle_length);
const auto * last_match = static_cast<UInt8 *>(nullptr);
const auto * start_pos = cur_haystack_data;
const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
while (start_pos < cur_haystack_end)
{
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
{
/// Copy prefix before match
copyToOutput(start_pos, match - start_pos, res_data, res_offset);
/// Insert replacement for match
copyToOutput(replacement.data(), replacement.size(), res_data, res_offset);
last_match = match;
start_pos = match + cur_needle_length;
if constexpr (replace == ReplaceStringTraits::Replace::First)
break;
}
else
break;
}
/// Copy suffix after last match
size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
: (cur_haystack_end - last_match - cur_needle_length + 1);
copyToOutput(start_pos, bytes, res_data, res_offset);
res_offsets[i] = res_offset;
prev_haystack_offset = haystack_offsets[i];
prev_needle_offset = needle_offsets[i];
}
}
static void vectorConstantVector(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const String & needle,
const ColumnString::Chars & replacement_data,
const ColumnString::Offsets & replacement_offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
chassert(haystack_offsets.size() == replacement_offsets.size());
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
const size_t haystack_size = haystack_offsets.size();
res_data.reserve(haystack_data.size());
res_offsets.resize(haystack_size);
ColumnString::Offset res_offset = 0;
size_t prev_haystack_offset = 0;
size_t prev_replacement_offset = 0;
for (size_t i = 0; i < haystack_size; ++i)
{
const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset];
const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1;
/// Using "slow" "stdlib searcher instead of Volnitsky just to keep things simple
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(needle.data(), needle.size());
const auto * last_match = static_cast<UInt8 *>(nullptr);
const auto * start_pos = cur_haystack_data;
const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
while (start_pos < cur_haystack_end)
{
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
{
/// Copy prefix before match
copyToOutput(start_pos, match - start_pos, res_data, res_offset);
/// Insert replacement for match
copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset);
last_match = match;
start_pos = match + needle.size();
if constexpr (replace == ReplaceStringTraits::Replace::First)
break;
}
else
break;
}
/// Copy suffix after last match
size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
: (cur_haystack_end - last_match - needle.size() + 1);
copyToOutput(start_pos, bytes, res_data, res_offset);
res_offsets[i] = res_offset;
prev_haystack_offset = haystack_offsets[i];
prev_replacement_offset = replacement_offsets[i];
}
}
static void vectorVectorVector(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const ColumnString::Chars & needle_data,
const ColumnString::Offsets & needle_offsets,
const ColumnString::Chars & replacement_data,
const ColumnString::Offsets & replacement_offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
chassert(haystack_offsets.size() == needle_offsets.size());
chassert(needle_offsets.size() == replacement_offsets.size());
const size_t haystack_size = haystack_offsets.size();
res_data.reserve(haystack_data.size());
res_offsets.resize(haystack_size);
ColumnString::Offset res_offset = 0;
size_t prev_haystack_offset = 0;
size_t prev_needle_offset = 0;
size_t prev_replacement_offset = 0;
for (size_t i = 0; i < haystack_size; ++i)
{
const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
const auto * const cur_needle_data = &needle_data[prev_needle_offset];
const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1;
const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset];
const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1;
if (cur_needle_length == 0)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
/// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(cur_needle_data, cur_needle_length);
const auto * last_match = static_cast<UInt8 *>(nullptr);
const auto * start_pos = cur_haystack_data;
const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
while (start_pos < cur_haystack_end)
{
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
{
/// Copy prefix before match
copyToOutput(start_pos, match - start_pos, res_data, res_offset);
/// Insert replacement for match
copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset);
last_match = match;
start_pos = match + cur_needle_length;
if constexpr (replace == ReplaceStringTraits::Replace::First)
break;
}
else
break;
}
/// Copy suffix after last match
size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
: (cur_haystack_end - last_match - cur_needle_length + 1);
copyToOutput(start_pos, bytes, res_data, res_offset);
res_offsets[i] = res_offset;
prev_haystack_offset = haystack_offsets[i];
prev_needle_offset = needle_offsets[i];
prev_replacement_offset = replacement_offsets[i];
}
}
/// Note: this function converts fixed-length strings to variable-length strings
/// and each variable-length string should ends with zero byte.
static void vectorFixedConstantConstant(
const ColumnString::Chars & haystack_data,
size_t n,
const String & needle,
const String & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
const UInt8 * const begin = haystack_data.data();
const UInt8 * const end = haystack_data.data() + haystack_data.size();
const UInt8 * pos = begin;
ColumnString::Offset res_offset = 0;
size_t haystack_size = haystack_data.size() / n;
res_data.reserve(haystack_data.size());
res_offsets.resize(haystack_size);
/// The current index in the string array.
size_t i = 0;
Volnitsky searcher(needle.data(), needle.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);
#define COPY_REST_OF_CURRENT_STRING() \
do \
{ \
const size_t len = begin + n * (i + 1) - pos; \
res_data.resize(res_data.size() + len + 1); \
memcpy(&res_data[res_offset], pos, len); \
res_offset += len; \
res_data[res_offset++] = 0; \
res_offsets[i] = res_offset; \
pos = begin + n * (i + 1); \
++i; \
} while (false)
/// Copy skipped strings without any changes but
/// add zero byte to the end of each string.
while (i < haystack_size && begin + n * (i + 1) <= match)
{
COPY_REST_OF_CURRENT_STRING();
}
/// If you have reached the end, it's time to stop
if (i == haystack_size)
break;
/// Copy unchanged part of current string.
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
res_offset += (match - pos);
/// Is it true that this string no longer needs to perform conversions.
bool can_finish_current_string = false;
/// We check that the entry does not pass through the boundaries of strings.
if (match + needle.size() <= begin + n * (i + 1))
{
res_data.resize(res_data.size() + replacement.size());
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
res_offset += replacement.size();
pos = match + needle.size();
if (replace == ReplaceStringTraits::Replace::First || pos == begin + n * (i + 1))
can_finish_current_string = true;
}
else
{
pos = match;
can_finish_current_string = true;
}
if (can_finish_current_string)
{
COPY_REST_OF_CURRENT_STRING();
}
#undef COPY_REST_OF_CURRENT_STRING
}
}
};
}