#pragma once #include #include #include namespace DB { namespace ErrorCodes { extern const int ARGUMENT_OUT_OF_BOUND; } struct ReplaceStringTraits { enum class Replace { First, All }; }; /** Replace one or all occurencies of substring 'needle' to 'replacement'. */ template struct ReplaceStringImpl { static constexpr auto name = Name::name; static void vectorConstantConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const String & needle, const String & replacement, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { if (needle.empty()) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); const UInt8 * const begin = haystack_data.data(); const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; ColumnString::Offset res_offset = 0; res_data.reserve(haystack_data.size()); const size_t haystack_size = haystack_offsets.size(); res_offsets.resize(haystack_size); /// The current index in the array of strings. size_t i = 0; Volnitsky searcher(needle.data(), needle.size(), end - pos); /// We will search for the next occurrence in all rows at once. while (pos < end) { const UInt8 * match = searcher.search(pos, end - pos); /// Copy the data without changing res_data.resize(res_data.size() + (match - pos)); memcpy(&res_data[res_offset], pos, match - pos); /// Determine which index it belongs to. while (i < haystack_offsets.size() && begin + haystack_offsets[i] <= match) { res_offsets[i] = res_offset + ((begin + haystack_offsets[i]) - pos); ++i; } res_offset += (match - pos); /// If you have reached the end, it's time to stop if (i == haystack_offsets.size()) break; /// Is it true that this string no longer needs to perform transformations. bool can_finish_current_string = false; /// We check that the entry does not go through the boundaries of strings. if (match + needle.size() < begin + haystack_offsets[i]) { res_data.resize(res_data.size() + replacement.size()); memcpy(&res_data[res_offset], replacement.data(), replacement.size()); res_offset += replacement.size(); pos = match + needle.size(); if constexpr (replace == ReplaceStringTraits::Replace::First) can_finish_current_string = true; } else { pos = match; can_finish_current_string = true; } if (can_finish_current_string) { res_data.resize(res_data.size() + (begin + haystack_offsets[i] - pos)); memcpy(&res_data[res_offset], pos, (begin + haystack_offsets[i] - pos)); res_offset += (begin + haystack_offsets[i] - pos); res_offsets[i] = res_offset; pos = begin + haystack_offsets[i]; ++i; } } } template requires (sizeof(CharT) == 1) static void copyToOutput( const CharT * what_start, size_t what_size, ColumnString::Chars & output, ColumnString::Offset & output_offset) { output.resize(output.size() + what_size); memcpy(&output[output_offset], what_start, what_size); output_offset += what_size; } static void vectorVectorConstant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offsets, const String & replacement, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { chassert(haystack_offsets.size() == needle_offsets.size()); const size_t haystack_size = haystack_offsets.size(); res_data.reserve(haystack_data.size()); res_offsets.resize(haystack_size); ColumnString::Offset res_offset = 0; size_t prev_haystack_offset = 0; size_t prev_needle_offset = 0; for (size_t i = 0; i < haystack_size; ++i) { const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset]; const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1; const auto * const cur_needle_data = &needle_data[prev_needle_offset]; const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1; if (cur_needle_length == 0) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); /// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row StdLibASCIIStringSearcher searcher(cur_needle_data, cur_needle_length); const auto * last_match = static_cast(nullptr); const auto * start_pos = cur_haystack_data; const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length; while (start_pos < cur_haystack_end) { if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end) { /// Copy prefix before match copyToOutput(start_pos, match - start_pos, res_data, res_offset); /// Insert replacement for match copyToOutput(replacement.data(), replacement.size(), res_data, res_offset); last_match = match; start_pos = match + cur_needle_length; if constexpr (replace == ReplaceStringTraits::Replace::First) break; } else break; } /// Copy suffix after last match size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1) : (cur_haystack_end - last_match - cur_needle_length + 1); copyToOutput(start_pos, bytes, res_data, res_offset); res_offsets[i] = res_offset; prev_haystack_offset = haystack_offsets[i]; prev_needle_offset = needle_offsets[i]; } } static void vectorConstantVector( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const String & needle, const ColumnString::Chars & replacement_data, const ColumnString::Offsets & replacement_offsets, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { chassert(haystack_offsets.size() == replacement_offsets.size()); if (needle.empty()) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); const size_t haystack_size = haystack_offsets.size(); res_data.reserve(haystack_data.size()); res_offsets.resize(haystack_size); ColumnString::Offset res_offset = 0; size_t prev_haystack_offset = 0; size_t prev_replacement_offset = 0; for (size_t i = 0; i < haystack_size; ++i) { const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset]; const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1; const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset]; const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1; /// Using "slow" "stdlib searcher instead of Volnitsky just to keep things simple StdLibASCIIStringSearcher searcher(needle.data(), needle.size()); const auto * last_match = static_cast(nullptr); const auto * start_pos = cur_haystack_data; const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length; while (start_pos < cur_haystack_end) { if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end) { /// Copy prefix before match copyToOutput(start_pos, match - start_pos, res_data, res_offset); /// Insert replacement for match copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset); last_match = match; start_pos = match + needle.size(); if constexpr (replace == ReplaceStringTraits::Replace::First) break; } else break; } /// Copy suffix after last match size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1) : (cur_haystack_end - last_match - needle.size() + 1); copyToOutput(start_pos, bytes, res_data, res_offset); res_offsets[i] = res_offset; prev_haystack_offset = haystack_offsets[i]; prev_replacement_offset = replacement_offsets[i]; } } static void vectorVectorVector( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offsets, const ColumnString::Chars & replacement_data, const ColumnString::Offsets & replacement_offsets, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { chassert(haystack_offsets.size() == needle_offsets.size()); chassert(needle_offsets.size() == replacement_offsets.size()); const size_t haystack_size = haystack_offsets.size(); res_data.reserve(haystack_data.size()); res_offsets.resize(haystack_size); ColumnString::Offset res_offset = 0; size_t prev_haystack_offset = 0; size_t prev_needle_offset = 0; size_t prev_replacement_offset = 0; for (size_t i = 0; i < haystack_size; ++i) { const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset]; const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1; const auto * const cur_needle_data = &needle_data[prev_needle_offset]; const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1; const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset]; const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1; if (cur_needle_length == 0) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); /// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row StdLibASCIIStringSearcher searcher(cur_needle_data, cur_needle_length); const auto * last_match = static_cast(nullptr); const auto * start_pos = cur_haystack_data; const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length; while (start_pos < cur_haystack_end) { if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end) { /// Copy prefix before match copyToOutput(start_pos, match - start_pos, res_data, res_offset); /// Insert replacement for match copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset); last_match = match; start_pos = match + cur_needle_length; if constexpr (replace == ReplaceStringTraits::Replace::First) break; } else break; } /// Copy suffix after last match size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1) : (cur_haystack_end - last_match - cur_needle_length + 1); copyToOutput(start_pos, bytes, res_data, res_offset); res_offsets[i] = res_offset; prev_haystack_offset = haystack_offsets[i]; prev_needle_offset = needle_offsets[i]; prev_replacement_offset = replacement_offsets[i]; } } /// Note: this function converts fixed-length strings to variable-length strings /// and each variable-length string should ends with zero byte. static void vectorFixedConstantConstant( const ColumnString::Chars & haystack_data, size_t n, const String & needle, const String & replacement, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { if (needle.empty()) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); const UInt8 * const begin = haystack_data.data(); const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; ColumnString::Offset res_offset = 0; size_t haystack_size = haystack_data.size() / n; res_data.reserve(haystack_data.size()); res_offsets.resize(haystack_size); /// The current index in the string array. size_t i = 0; Volnitsky searcher(needle.data(), needle.size(), end - pos); /// We will search for the next occurrence in all rows at once. while (pos < end) { const UInt8 * match = searcher.search(pos, end - pos); #define COPY_REST_OF_CURRENT_STRING() \ do \ { \ const size_t len = begin + n * (i + 1) - pos; \ res_data.resize(res_data.size() + len + 1); \ memcpy(&res_data[res_offset], pos, len); \ res_offset += len; \ res_data[res_offset++] = 0; \ res_offsets[i] = res_offset; \ pos = begin + n * (i + 1); \ ++i; \ } while (false) /// Copy skipped strings without any changes but /// add zero byte to the end of each string. while (i < haystack_size && begin + n * (i + 1) <= match) { COPY_REST_OF_CURRENT_STRING(); } /// If you have reached the end, it's time to stop if (i == haystack_size) break; /// Copy unchanged part of current string. res_data.resize(res_data.size() + (match - pos)); memcpy(&res_data[res_offset], pos, match - pos); res_offset += (match - pos); /// Is it true that this string no longer needs to perform conversions. bool can_finish_current_string = false; /// We check that the entry does not pass through the boundaries of strings. if (match + needle.size() <= begin + n * (i + 1)) { res_data.resize(res_data.size() + replacement.size()); memcpy(&res_data[res_offset], replacement.data(), replacement.size()); res_offset += replacement.size(); pos = match + needle.size(); if (replace == ReplaceStringTraits::Replace::First || pos == begin + n * (i + 1)) can_finish_current_string = true; } else { pos = match; can_finish_current_string = true; } if (can_finish_current_string) { COPY_REST_OF_CURRENT_STRING(); } #undef COPY_REST_OF_CURRENT_STRING } } }; }