2020-05-06 23:21:13 +00:00
|
|
|
#pragma once
|
|
|
|
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/types.h>
|
2020-05-06 23:21:13 +00:00
|
|
|
#include <Common/Volnitsky.h>
|
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2023-02-20 09:10:25 +00:00
|
|
|
namespace ErrorCodes
{
    /// Raised by the replace implementations in this file when the search pattern (needle) is empty.
    extern const int ARGUMENT_OUT_OF_BOUND;
}
|
|
|
|
|
2022-10-24 08:41:29 +00:00
|
|
|
/// Compile-time options for the string replacement implementations.
struct ReplaceStringTraits
{
    /// Selects how many occurrences of the needle are substituted in each row.
    enum class Replace
    {
        First, /// Only the first occurrence; the rest of the row is copied unchanged.
        All    /// Every occurrence; the search resumes right after each replaced match.
    };
};
|
2023-02-20 09:10:25 +00:00
|
|
|
|
|
|
|
/** Replace one or all occurrences of substring 'needle' with 'replacement'.
  */
|
2023-02-20 09:10:25 +00:00
|
|
|
template <typename Name, ReplaceStringTraits::Replace replace>
|
2020-05-06 23:21:13 +00:00
|
|
|
struct ReplaceStringImpl
|
|
|
|
{
|
2023-02-20 09:10:25 +00:00
|
|
|
static constexpr auto name = Name::name;
|
|
|
|
|
|
|
|
static void vectorConstantConstant(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
|
|
|
const String & needle,
|
|
|
|
const String & replacement,
|
2020-05-06 23:21:13 +00:00
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
{
|
2023-02-20 09:10:25 +00:00
|
|
|
if (needle.empty())
|
|
|
|
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
|
|
|
|
|
|
|
const UInt8 * const begin = haystack_data.data();
|
|
|
|
const UInt8 * const end = haystack_data.data() + haystack_data.size();
|
2020-05-06 23:21:13 +00:00
|
|
|
const UInt8 * pos = begin;
|
|
|
|
|
|
|
|
ColumnString::Offset res_offset = 0;
|
2023-02-20 09:10:25 +00:00
|
|
|
res_data.reserve(haystack_data.size());
|
|
|
|
const size_t haystack_size = haystack_offsets.size();
|
|
|
|
res_offsets.resize(haystack_size);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
|
|
|
/// The current index in the array of strings.
|
|
|
|
size_t i = 0;
|
|
|
|
|
|
|
|
Volnitsky searcher(needle.data(), needle.size(), end - pos);
|
|
|
|
|
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
|
|
while (pos < end)
|
|
|
|
{
|
|
|
|
const UInt8 * match = searcher.search(pos, end - pos);
|
|
|
|
|
|
|
|
/// Copy the data without changing
|
|
|
|
res_data.resize(res_data.size() + (match - pos));
|
|
|
|
memcpy(&res_data[res_offset], pos, match - pos);
|
|
|
|
|
|
|
|
/// Determine which index it belongs to.
|
2023-02-20 09:10:25 +00:00
|
|
|
while (i < haystack_offsets.size() && begin + haystack_offsets[i] <= match)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2023-02-20 09:10:25 +00:00
|
|
|
res_offsets[i] = res_offset + ((begin + haystack_offsets[i]) - pos);
|
2020-05-06 23:21:13 +00:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
res_offset += (match - pos);
|
|
|
|
|
|
|
|
/// If you have reached the end, it's time to stop
|
2023-02-20 09:10:25 +00:00
|
|
|
if (i == haystack_offsets.size())
|
2020-05-06 23:21:13 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
/// Is it true that this string no longer needs to perform transformations.
|
|
|
|
bool can_finish_current_string = false;
|
|
|
|
|
|
|
|
/// We check that the entry does not go through the boundaries of strings.
|
2023-02-20 09:10:25 +00:00
|
|
|
if (match + needle.size() < begin + haystack_offsets[i])
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
|
|
|
res_data.resize(res_data.size() + replacement.size());
|
|
|
|
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
|
|
|
|
res_offset += replacement.size();
|
|
|
|
pos = match + needle.size();
|
2022-10-24 08:41:29 +00:00
|
|
|
if constexpr (replace == ReplaceStringTraits::Replace::First)
|
2020-05-06 23:21:13 +00:00
|
|
|
can_finish_current_string = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
pos = match;
|
|
|
|
can_finish_current_string = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (can_finish_current_string)
|
|
|
|
{
|
2023-02-20 09:10:25 +00:00
|
|
|
res_data.resize(res_data.size() + (begin + haystack_offsets[i] - pos));
|
|
|
|
memcpy(&res_data[res_offset], pos, (begin + haystack_offsets[i] - pos));
|
|
|
|
res_offset += (begin + haystack_offsets[i] - pos);
|
2020-05-06 23:21:13 +00:00
|
|
|
res_offsets[i] = res_offset;
|
2023-02-20 09:10:25 +00:00
|
|
|
pos = begin + haystack_offsets[i];
|
2020-05-06 23:21:13 +00:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-20 09:10:25 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
|
|
|
static void copyToOutput(
|
|
|
|
const CharT * what_start, size_t what_size,
|
|
|
|
ColumnString::Chars & output, size_t & output_offset)
|
|
|
|
{
|
|
|
|
output.resize(output.size() + what_size);
|
|
|
|
memcpy(&output[output_offset], what_start, what_size);
|
|
|
|
output_offset += what_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vectorVectorConstant(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
|
|
|
const ColumnString::Chars & needle_data,
|
|
|
|
const ColumnString::Offsets & needle_offsets,
|
|
|
|
const String & replacement,
|
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
{
|
|
|
|
chassert(haystack_offsets.size() == needle_offsets.size());
|
|
|
|
|
|
|
|
const size_t haystack_size = haystack_offsets.size();
|
|
|
|
|
|
|
|
res_data.reserve(haystack_data.size());
|
|
|
|
res_offsets.resize(haystack_size);
|
|
|
|
|
|
|
|
ColumnString::Offset res_offset = 0;
|
|
|
|
|
|
|
|
size_t prev_haystack_offset = 0;
|
|
|
|
size_t prev_needle_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < haystack_size; ++i)
|
|
|
|
{
|
|
|
|
const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
|
|
|
|
const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
|
|
|
|
|
|
|
|
const auto * const cur_needle_data = &needle_data[prev_needle_offset];
|
|
|
|
const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1;
|
|
|
|
|
|
|
|
if (cur_needle_length == 0)
|
|
|
|
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
|
|
|
|
|
|
|
/// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row
|
|
|
|
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(cur_needle_data, cur_needle_length);
|
|
|
|
|
|
|
|
const auto * last_match = static_cast<UInt8 *>(nullptr);
|
|
|
|
const auto * start_pos = cur_haystack_data;
|
|
|
|
const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
|
|
|
|
|
|
|
|
while (start_pos < cur_haystack_end)
|
|
|
|
{
|
|
|
|
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
|
|
|
|
{
|
|
|
|
/// Copy prefix before match
|
|
|
|
copyToOutput(start_pos, match - start_pos, res_data, res_offset);
|
|
|
|
|
|
|
|
/// Insert replacement for match
|
|
|
|
copyToOutput(replacement.data(), replacement.size(), res_data, res_offset);
|
|
|
|
|
|
|
|
last_match = match;
|
|
|
|
start_pos = match + cur_needle_length;
|
|
|
|
|
|
|
|
if constexpr (replace == ReplaceStringTraits::Replace::First)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Copy suffix after last match
|
|
|
|
size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
|
|
|
|
: (cur_haystack_end - last_match - cur_needle_length + 1);
|
|
|
|
copyToOutput(start_pos, bytes, res_data, res_offset);
|
|
|
|
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
|
|
|
|
prev_haystack_offset = haystack_offsets[i];
|
|
|
|
prev_needle_offset = needle_offsets[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vectorConstantVector(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
|
|
|
const String & needle,
|
|
|
|
const ColumnString::Chars & replacement_data,
|
|
|
|
const ColumnString::Offsets & replacement_offsets,
|
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
{
|
|
|
|
chassert(haystack_offsets.size() == replacement_offsets.size());
|
|
|
|
|
|
|
|
if (needle.empty())
|
|
|
|
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
|
|
|
|
|
|
|
const size_t haystack_size = haystack_offsets.size();
|
|
|
|
|
|
|
|
res_data.reserve(haystack_data.size());
|
|
|
|
res_offsets.resize(haystack_size);
|
|
|
|
|
|
|
|
ColumnString::Offset res_offset = 0;
|
|
|
|
|
|
|
|
size_t prev_haystack_offset = 0;
|
|
|
|
size_t prev_replacement_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < haystack_size; ++i)
|
|
|
|
{
|
|
|
|
const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
|
|
|
|
const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
|
|
|
|
|
|
|
|
const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset];
|
|
|
|
const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1;
|
|
|
|
|
|
|
|
/// Using "slow" "stdlib searcher instead of Volnitsky just to keep things simple
|
|
|
|
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(needle.data(), needle.size());
|
|
|
|
|
|
|
|
const auto * last_match = static_cast<UInt8 *>(nullptr);
|
|
|
|
const auto * start_pos = cur_haystack_data;
|
|
|
|
const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
|
|
|
|
|
|
|
|
while (start_pos < cur_haystack_end)
|
|
|
|
{
|
|
|
|
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
|
|
|
|
{
|
|
|
|
/// Copy prefix before match
|
|
|
|
copyToOutput(start_pos, match - start_pos, res_data, res_offset);
|
|
|
|
|
|
|
|
/// Insert replacement for match
|
|
|
|
copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset);
|
|
|
|
|
|
|
|
last_match = match;
|
|
|
|
start_pos = match + needle.size();
|
|
|
|
|
|
|
|
if constexpr (replace == ReplaceStringTraits::Replace::First)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Copy suffix after last match
|
|
|
|
size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
|
|
|
|
: (cur_haystack_end - last_match - needle.size() + 1);
|
|
|
|
copyToOutput(start_pos, bytes, res_data, res_offset);
|
|
|
|
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
|
|
|
|
prev_haystack_offset = haystack_offsets[i];
|
|
|
|
prev_replacement_offset = replacement_offsets[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vectorVectorVector(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
|
|
|
const ColumnString::Chars & needle_data,
|
|
|
|
const ColumnString::Offsets & needle_offsets,
|
|
|
|
const ColumnString::Chars & replacement_data,
|
|
|
|
const ColumnString::Offsets & replacement_offsets,
|
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
{
|
|
|
|
chassert(haystack_offsets.size() == needle_offsets.size());
|
|
|
|
chassert(needle_offsets.size() == replacement_offsets.size());
|
|
|
|
|
|
|
|
const size_t haystack_size = haystack_offsets.size();
|
|
|
|
|
|
|
|
res_data.reserve(haystack_data.size());
|
|
|
|
res_offsets.resize(haystack_size);
|
|
|
|
|
|
|
|
ColumnString::Offset res_offset = 0;
|
|
|
|
|
|
|
|
size_t prev_haystack_offset = 0;
|
|
|
|
size_t prev_needle_offset = 0;
|
|
|
|
size_t prev_replacement_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < haystack_size; ++i)
|
|
|
|
{
|
|
|
|
const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
|
|
|
|
const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
|
|
|
|
|
|
|
|
const auto * const cur_needle_data = &needle_data[prev_needle_offset];
|
|
|
|
const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1;
|
|
|
|
|
|
|
|
const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset];
|
|
|
|
const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1;
|
|
|
|
|
|
|
|
if (cur_needle_length == 0)
|
|
|
|
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
|
|
|
|
|
|
|
/// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row
|
|
|
|
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(cur_needle_data, cur_needle_length);
|
|
|
|
|
|
|
|
const auto * last_match = static_cast<UInt8 *>(nullptr);
|
|
|
|
const auto * start_pos = cur_haystack_data;
|
|
|
|
const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;
|
|
|
|
|
|
|
|
while (start_pos < cur_haystack_end)
|
|
|
|
{
|
|
|
|
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
|
|
|
|
{
|
|
|
|
/// Copy prefix before match
|
|
|
|
copyToOutput(start_pos, match - start_pos, res_data, res_offset);
|
|
|
|
|
|
|
|
/// Insert replacement for match
|
|
|
|
copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset);
|
|
|
|
|
|
|
|
last_match = match;
|
|
|
|
start_pos = match + cur_needle_length;
|
|
|
|
|
|
|
|
if constexpr (replace == ReplaceStringTraits::Replace::First)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Copy suffix after last match
|
|
|
|
size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
|
|
|
|
: (cur_haystack_end - last_match - cur_needle_length + 1);
|
|
|
|
copyToOutput(start_pos, bytes, res_data, res_offset);
|
|
|
|
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
|
|
|
|
prev_haystack_offset = haystack_offsets[i];
|
|
|
|
prev_needle_offset = needle_offsets[i];
|
|
|
|
prev_replacement_offset = replacement_offsets[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-06 23:21:13 +00:00
|
|
|
/// Note: this function converts fixed-length strings to variable-length strings
|
|
|
|
/// and each variable-length string should ends with zero byte.
|
2023-02-20 09:10:25 +00:00
|
|
|
static void vectorFixedConstantConstant(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
2020-05-06 23:21:13 +00:00
|
|
|
size_t n,
|
2023-02-20 09:10:25 +00:00
|
|
|
const String & needle,
|
|
|
|
const String & replacement,
|
2020-05-06 23:21:13 +00:00
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
{
|
2023-02-20 09:10:25 +00:00
|
|
|
if (needle.empty())
|
|
|
|
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
|
|
|
|
|
|
|
const UInt8 * const begin = haystack_data.data();
|
|
|
|
const UInt8 * const end = haystack_data.data() + haystack_data.size();
|
2020-05-06 23:21:13 +00:00
|
|
|
const UInt8 * pos = begin;
|
|
|
|
|
|
|
|
ColumnString::Offset res_offset = 0;
|
2023-02-20 09:10:25 +00:00
|
|
|
size_t haystack_size = haystack_data.size() / n;
|
|
|
|
res_data.reserve(haystack_data.size());
|
|
|
|
res_offsets.resize(haystack_size);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
|
|
|
/// The current index in the string array.
|
|
|
|
size_t i = 0;
|
|
|
|
|
|
|
|
Volnitsky searcher(needle.data(), needle.size(), end - pos);
|
|
|
|
|
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
|
|
while (pos < end)
|
|
|
|
{
|
|
|
|
const UInt8 * match = searcher.search(pos, end - pos);
|
|
|
|
|
|
|
|
#define COPY_REST_OF_CURRENT_STRING() \
|
|
|
|
do \
|
|
|
|
{ \
|
|
|
|
const size_t len = begin + n * (i + 1) - pos; \
|
|
|
|
res_data.resize(res_data.size() + len + 1); \
|
|
|
|
memcpy(&res_data[res_offset], pos, len); \
|
|
|
|
res_offset += len; \
|
|
|
|
res_data[res_offset++] = 0; \
|
|
|
|
res_offsets[i] = res_offset; \
|
|
|
|
pos = begin + n * (i + 1); \
|
|
|
|
++i; \
|
|
|
|
} while (false)
|
|
|
|
|
|
|
|
/// Copy skipped strings without any changes but
|
|
|
|
/// add zero byte to the end of each string.
|
2023-02-20 09:10:25 +00:00
|
|
|
while (i < haystack_size && begin + n * (i + 1) <= match)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
|
|
|
COPY_REST_OF_CURRENT_STRING();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// If you have reached the end, it's time to stop
|
2023-02-20 09:10:25 +00:00
|
|
|
if (i == haystack_size)
|
2020-05-06 23:21:13 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
/// Copy unchanged part of current string.
|
|
|
|
res_data.resize(res_data.size() + (match - pos));
|
|
|
|
memcpy(&res_data[res_offset], pos, match - pos);
|
|
|
|
res_offset += (match - pos);
|
|
|
|
|
|
|
|
/// Is it true that this string no longer needs to perform conversions.
|
|
|
|
bool can_finish_current_string = false;
|
|
|
|
|
|
|
|
/// We check that the entry does not pass through the boundaries of strings.
|
|
|
|
if (match + needle.size() <= begin + n * (i + 1))
|
|
|
|
{
|
|
|
|
res_data.resize(res_data.size() + replacement.size());
|
|
|
|
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
|
|
|
|
res_offset += replacement.size();
|
|
|
|
pos = match + needle.size();
|
2022-10-24 08:41:29 +00:00
|
|
|
if (replace == ReplaceStringTraits::Replace::First || pos == begin + n * (i + 1))
|
2020-05-06 23:21:13 +00:00
|
|
|
can_finish_current_string = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
pos = match;
|
|
|
|
can_finish_current_string = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (can_finish_current_string)
|
|
|
|
{
|
|
|
|
COPY_REST_OF_CURRENT_STRING();
|
|
|
|
}
|
|
|
|
#undef COPY_REST_OF_CURRENT_STRING
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
}
|