mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-15 10:52:30 +00:00
177 lines
5.8 KiB
C++
177 lines
5.8 KiB
C++
#pragma once
|
|
|
|
#include <base/types.h>
|
|
#include <Common/Volnitsky.h>
|
|
#include <Columns/ColumnString.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
|
|
/** Replace one or all occurrences of substring 'needle' with 'replacement'. 'needle' and 'replacement' are constants.
|
|
*/
|
|
template <bool replace_one = false>
|
|
struct ReplaceStringImpl
|
|
{
|
|
static void vector(
|
|
const ColumnString::Chars & data,
|
|
const ColumnString::Offsets & offsets,
|
|
const std::string & needle,
|
|
const std::string & replacement,
|
|
ColumnString::Chars & res_data,
|
|
ColumnString::Offsets & res_offsets)
|
|
{
|
|
const UInt8 * begin = data.data();
|
|
const UInt8 * pos = begin;
|
|
const UInt8 * end = pos + data.size();
|
|
|
|
ColumnString::Offset res_offset = 0;
|
|
res_data.reserve(data.size());
|
|
size_t size = offsets.size();
|
|
res_offsets.resize(size);
|
|
|
|
/// The current index in the array of strings.
|
|
size_t i = 0;
|
|
|
|
Volnitsky searcher(needle.data(), needle.size(), end - pos);
|
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
while (pos < end)
|
|
{
|
|
const UInt8 * match = searcher.search(pos, end - pos);
|
|
|
|
/// Copy the data without changing
|
|
res_data.resize(res_data.size() + (match - pos));
|
|
memcpy(&res_data[res_offset], pos, match - pos);
|
|
|
|
/// Determine which index it belongs to.
|
|
while (i < offsets.size() && begin + offsets[i] <= match)
|
|
{
|
|
res_offsets[i] = res_offset + ((begin + offsets[i]) - pos);
|
|
++i;
|
|
}
|
|
res_offset += (match - pos);
|
|
|
|
/// If you have reached the end, it's time to stop
|
|
if (i == offsets.size())
|
|
break;
|
|
|
|
/// Is it true that this string no longer needs to perform transformations.
|
|
bool can_finish_current_string = false;
|
|
|
|
/// We check that the entry does not go through the boundaries of strings.
|
|
if (match + needle.size() < begin + offsets[i])
|
|
{
|
|
res_data.resize(res_data.size() + replacement.size());
|
|
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
|
|
res_offset += replacement.size();
|
|
pos = match + needle.size();
|
|
if (replace_one)
|
|
can_finish_current_string = true;
|
|
}
|
|
else
|
|
{
|
|
pos = match;
|
|
can_finish_current_string = true;
|
|
}
|
|
|
|
if (can_finish_current_string)
|
|
{
|
|
res_data.resize(res_data.size() + (begin + offsets[i] - pos));
|
|
memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos));
|
|
res_offset += (begin + offsets[i] - pos);
|
|
res_offsets[i] = res_offset;
|
|
pos = begin + offsets[i];
|
|
++i;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Note: this function converts fixed-length strings to variable-length strings
|
|
/// and each variable-length string should ends with zero byte.
|
|
static void vectorFixed(
|
|
const ColumnString::Chars & data,
|
|
size_t n,
|
|
const std::string & needle,
|
|
const std::string & replacement,
|
|
ColumnString::Chars & res_data,
|
|
ColumnString::Offsets & res_offsets)
|
|
{
|
|
const UInt8 * begin = data.data();
|
|
const UInt8 * pos = begin;
|
|
const UInt8 * end = pos + data.size();
|
|
|
|
ColumnString::Offset res_offset = 0;
|
|
size_t count = data.size() / n;
|
|
res_data.reserve(data.size());
|
|
res_offsets.resize(count);
|
|
|
|
/// The current index in the string array.
|
|
size_t i = 0;
|
|
|
|
Volnitsky searcher(needle.data(), needle.size(), end - pos);
|
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
while (pos < end)
|
|
{
|
|
const UInt8 * match = searcher.search(pos, end - pos);
|
|
|
|
#define COPY_REST_OF_CURRENT_STRING() \
|
|
do \
|
|
{ \
|
|
const size_t len = begin + n * (i + 1) - pos; \
|
|
res_data.resize(res_data.size() + len + 1); \
|
|
memcpy(&res_data[res_offset], pos, len); \
|
|
res_offset += len; \
|
|
res_data[res_offset++] = 0; \
|
|
res_offsets[i] = res_offset; \
|
|
pos = begin + n * (i + 1); \
|
|
++i; \
|
|
} while (false)
|
|
|
|
/// Copy skipped strings without any changes but
|
|
/// add zero byte to the end of each string.
|
|
while (i < count && begin + n * (i + 1) <= match)
|
|
{
|
|
COPY_REST_OF_CURRENT_STRING();
|
|
}
|
|
|
|
/// If you have reached the end, it's time to stop
|
|
if (i == count)
|
|
break;
|
|
|
|
/// Copy unchanged part of current string.
|
|
res_data.resize(res_data.size() + (match - pos));
|
|
memcpy(&res_data[res_offset], pos, match - pos);
|
|
res_offset += (match - pos);
|
|
|
|
/// Is it true that this string no longer needs to perform conversions.
|
|
bool can_finish_current_string = false;
|
|
|
|
/// We check that the entry does not pass through the boundaries of strings.
|
|
if (match + needle.size() <= begin + n * (i + 1))
|
|
{
|
|
res_data.resize(res_data.size() + replacement.size());
|
|
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
|
|
res_offset += replacement.size();
|
|
pos = match + needle.size();
|
|
if (replace_one || pos == begin + n * (i + 1))
|
|
can_finish_current_string = true;
|
|
}
|
|
else
|
|
{
|
|
pos = match;
|
|
can_finish_current_string = true;
|
|
}
|
|
|
|
if (can_finish_current_string)
|
|
{
|
|
COPY_REST_OF_CURRENT_STRING();
|
|
}
|
|
#undef COPY_REST_OF_CURRENT_STRING
|
|
}
|
|
}
|
|
};
|
|
|
|
}
|