diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 50e15f70f5d..d4c7c451af2 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -13,17 +13,18 @@ Functions for [searching](../../sql-reference/functions/string-search-functions. ## replaceOne(haystack, pattern, replacement) Replaces the first occurrence of the substring ‘pattern’ (if it exists) in ‘haystack’ by the ‘replacement’ string. -‘pattern’ and ‘replacement’ must be constants. ## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement) Replaces all occurrences of the substring ‘pattern’ in ‘haystack’ by the ‘replacement’ string. +Alias: `replace`. + ## replaceRegexpOne(haystack, pattern, replacement) Replaces the first occurrence of the substring matching the regular expression ‘pattern’ in ‘haystack‘ by the ‘replacement‘ string. -‘pattern‘ must be a constant [re2 regular expression](https://github.com/google/re2/wiki/Syntax). -‘replacement’ must be a plain constant string or a constant string containing substitutions `\0-\9`. +‘pattern’ must be a [re2 regular expression](https://github.com/google/re2/wiki/Syntax). +‘replacement’ must be a plain string or a string containing substitutions `\0-\9`. Substitutions `\1-\9` correspond to the 1st to 9th capturing group (submatch), substitution `\0` corresponds to the entire match. To use a verbatim `\` character in the ‘pattern‘ or ‘replacement‘ string, escape it using `\`. Also keep in mind that string literals require an extra escaping. @@ -88,6 +89,8 @@ SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res └─────────────────────┘ ``` +Alias: `REGEXP_REPLACE`. + ## regexpQuoteMeta(s) The function adds a backslash before some predefined characters in the string. 
diff --git a/src/Functions/FunctionStringReplace.h b/src/Functions/FunctionStringReplace.h index f90eac2e7f3..6199e146210 100644 --- a/src/Functions/FunctionStringReplace.h +++ b/src/Functions/FunctionStringReplace.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -13,16 +14,14 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; - extern const int ARGUMENT_OUT_OF_BOUND; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; } - template class FunctionStringReplace : public IFunction { public: static constexpr auto name = Name::name; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { return name; } @@ -32,65 +31,80 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (!isStringOrFixedString(arguments[0])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of first argument of function {}", - arguments[0]->getName(), getName()); + FunctionArgumentDescriptors args{ + {"haystack", &isStringOrFixedString, nullptr, "String or FixedString"}, + {"pattern", &isString, nullptr, "String"}, + {"replacement", &isString, nullptr, "String"} + }; - if (!isStringOrFixedString(arguments[1])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of second argument of function {}", - arguments[1]->getName(), getName()); - - if (!isStringOrFixedString(arguments[2])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of third argument of function {}", - arguments[2]->getName(), getName()); + 
validateFunctionArgumentTypes(*this, arguments, args); return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { - const ColumnPtr column_src = arguments[0].column; + const ColumnPtr column_haystack = arguments[0].column; const ColumnPtr column_needle = arguments[1].column; const ColumnPtr column_replacement = arguments[2].column; - if (!isColumnConst(*column_needle) || !isColumnConst(*column_replacement)) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "2nd and 3rd arguments of function {} must be constants.", - getName()); + const ColumnString * col_haystack = checkAndGetColumn(column_haystack.get()); + const ColumnFixedString * col_haystack_fixed = checkAndGetColumn(column_haystack.get()); - const IColumn * c1 = arguments[1].column.get(); - const IColumn * c2 = arguments[2].column.get(); - const ColumnConst * c1_const = typeid_cast(c1); - const ColumnConst * c2_const = typeid_cast(c2); - String needle = c1_const->getValue(); - String replacement = c2_const->getValue(); + const ColumnString * col_needle_vector = checkAndGetColumn(column_needle.get()); + const ColumnConst * col_needle_const = checkAndGetColumn(column_needle.get()); - if (needle.empty()) - throw Exception( - ErrorCodes::ARGUMENT_OUT_OF_BOUND, - "Length of the second argument of function replace must be greater than 0."); + const ColumnString * col_replacement_vector = checkAndGetColumn(column_replacement.get()); + const ColumnConst * col_replacement_const = checkAndGetColumn(column_replacement.get()); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + auto col_res = ColumnString::create(); + + if (col_haystack && col_needle_const && col_replacement_const) { - auto col_res = ColumnString::create(); - Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, col_res->getChars(), col_res->getOffsets()); + Impl::vectorConstantConstant( + col_haystack->getChars(), 
col_haystack->getOffsets(), + col_needle_const->getValue(), + col_replacement_const->getValue(), + col_res->getChars(), col_res->getOffsets()); return col_res; } - else if (const ColumnFixedString * col_fixed = checkAndGetColumn(column_src.get())) + else if (col_haystack && col_needle_vector && col_replacement_const) { - auto col_res = ColumnString::create(); - Impl::vectorFixed(col_fixed->getChars(), col_fixed->getN(), needle, replacement, col_res->getChars(), col_res->getOffsets()); + Impl::vectorVectorConstant( + col_haystack->getChars(), col_haystack->getOffsets(), + col_needle_vector->getChars(), col_needle_vector->getOffsets(), + col_replacement_const->getValue(), + col_res->getChars(), col_res->getOffsets()); + return col_res; + } + else if (col_haystack && col_needle_const && col_replacement_vector) + { + Impl::vectorConstantVector( + col_haystack->getChars(), col_haystack->getOffsets(), + col_needle_const->getValue(), + col_replacement_vector->getChars(), col_replacement_vector->getOffsets(), + col_res->getChars(), col_res->getOffsets()); + return col_res; + } + else if (col_haystack && col_needle_vector && col_replacement_vector) + { + Impl::vectorVectorVector( + col_haystack->getChars(), col_haystack->getOffsets(), + col_needle_vector->getChars(), col_needle_vector->getOffsets(), + col_replacement_vector->getChars(), col_replacement_vector->getOffsets(), + col_res->getChars(), col_res->getOffsets()); + return col_res; + } + else if (col_haystack_fixed && col_needle_const && col_replacement_const) + { + Impl::vectorFixedConstantConstant( + col_haystack_fixed->getChars(), col_haystack_fixed->getN(), + col_needle_const->getValue(), + col_replacement_const->getValue(), + col_res->getChars(), col_res->getOffsets()); return col_res; } else diff --git a/src/Functions/ReplaceRegexpImpl.h b/src/Functions/ReplaceRegexpImpl.h index 88d7a40d2dd..7e3af1e62d9 100644 --- a/src/Functions/ReplaceRegexpImpl.h +++ b/src/Functions/ReplaceRegexpImpl.h @@ -13,6 +13,7 @@ 
namespace DB namespace ErrorCodes { + extern const int ARGUMENT_OUT_OF_BOUND; extern const int BAD_ARGUMENTS; } @@ -28,9 +29,11 @@ struct ReplaceRegexpTraits /** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants. * 'replacement' can contain substitutions, for example: '\2-\3-\1' */ -template +template struct ReplaceRegexpImpl { + static constexpr auto name = Name::name; + struct Instruction { /// If not negative, perform substitution of n-th subpattern from the regexp match. @@ -162,18 +165,21 @@ struct ReplaceRegexpImpl ++res_offset; } - static void vector( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, + static void vectorConstantConstant( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, const String & needle, const String & replacement, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { + if (needle.empty()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + ColumnString::Offset res_offset = 0; - res_data.reserve(data.size()); - size_t size = offsets.size(); - res_offsets.resize(size); + res_data.reserve(haystack_data.size()); + size_t haystack_size = haystack_offsets.size(); + res_offsets.resize(haystack_size); re2_st::RE2::Options regexp_options; /// Don't write error messages to stderr. 
@@ -182,39 +188,89 @@ struct ReplaceRegexpImpl re2_st::RE2 searcher(needle, regexp_options); if (!searcher.ok()) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "The pattern argument is not a valid re2 pattern: {}", - searcher.error()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error()); int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures); Instructions instructions = createInstructions(replacement, num_captures); /// Cannot perform search for whole columns. Will process each string separately. - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < haystack_size; ++i) { - size_t from = i > 0 ? offsets[i - 1] : 0; - const char * haystack_data = reinterpret_cast(data.data() + from); - const size_t haystack_length = static_cast(offsets[i] - from - 1); + size_t from = i > 0 ? haystack_offsets[i - 1] : 0; - processString(haystack_data, haystack_length, res_data, res_offset, searcher, num_captures, instructions); + const char * hs_data = reinterpret_cast(haystack_data.data() + from); + const size_t hs_length = static_cast(haystack_offsets[i] - from - 1); + + processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions); res_offsets[i] = res_offset; } } - static void vectorFixed( - const ColumnString::Chars & data, - size_t n, - const String & needle, + static void vectorVectorConstant( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnString::Chars & needle_data, + const ColumnString::Offsets & needle_offsets, const String & replacement, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { + assert(haystack_offsets.size() == needle_offsets.size()); + ColumnString::Offset res_offset = 0; - size_t size = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(size); + res_data.reserve(haystack_data.size()); + size_t haystack_size = 
haystack_offsets.size(); + res_offsets.resize(haystack_size); + + re2_st::RE2::Options regexp_options; + /// Don't write error messages to stderr. + regexp_options.set_log_errors(false); + + /// Cannot perform search for whole columns. Will process each string separately. + for (size_t i = 0; i < haystack_size; ++i) + { + size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0; + const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from); + const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1); + + size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0; + const char * ndl_data = reinterpret_cast(needle_data.data() + ndl_from); + const size_t ndl_length = static_cast(needle_offsets[i] - ndl_from - 1); + std::string_view needle(ndl_data, ndl_length); + + if (needle.empty()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + re2_st::RE2 searcher(needle, regexp_options); + if (!searcher.ok()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error()); + int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures); + Instructions instructions = createInstructions(replacement, num_captures); + + processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions); + res_offsets[i] = res_offset; + } + } + + static void vectorConstantVector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const String & needle, + const ColumnString::Chars & replacement_data, + const ColumnString::Offsets & replacement_offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + assert(haystack_offsets.size() == replacement_offsets.size()); + + if (needle.empty()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + 
ColumnString::Offset res_offset = 0; + res_data.reserve(haystack_data.size()); + size_t haystack_size = haystack_offsets.size(); + res_offsets.resize(haystack_size); re2_st::RE2::Options regexp_options; /// Don't write error messages to stderr. @@ -223,22 +279,116 @@ struct ReplaceRegexpImpl re2_st::RE2 searcher(needle, regexp_options); if (!searcher.ok()) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "The pattern argument is not a valid re2 pattern: {}", - searcher.error()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error()); + + int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures); + + /// Cannot perform search for whole columns. Will process each string separately. + for (size_t i = 0; i < haystack_size; ++i) + { + size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0; + const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from); + const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1); + + size_t repl_from = i > 0 ? 
replacement_offsets[i - 1] : 0; + const char * repl_data = reinterpret_cast(replacement_data.data() + repl_from); + const size_t repl_length = static_cast(replacement_offsets[i] - repl_from - 1); + + Instructions instructions = createInstructions(std::string_view(repl_data, repl_length), num_captures); + + processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions); + res_offsets[i] = res_offset; + } + } + + static void vectorVectorVector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnString::Chars & needle_data, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars & replacement_data, + const ColumnString::Offsets & replacement_offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + assert(haystack_offsets.size() == needle_offsets.size()); + assert(needle_offsets.size() == replacement_offsets.size()); + + ColumnString::Offset res_offset = 0; + res_data.reserve(haystack_data.size()); + size_t haystack_size = haystack_offsets.size(); + res_offsets.resize(haystack_size); + + re2_st::RE2::Options regexp_options; + /// Don't write error messages to stderr. + regexp_options.set_log_errors(false); + + /// Cannot perform search for whole columns. Will process each string separately. + for (size_t i = 0; i < haystack_size; ++i) + { + size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0; + const char * hs_data = reinterpret_cast(haystack_data.data() + hs_from); + const size_t hs_length = static_cast(haystack_offsets[i] - hs_from - 1); + + size_t ndl_from = i > 0 ? 
needle_offsets[i - 1] : 0; + const char * ndl_data = reinterpret_cast(needle_data.data() + ndl_from); + const size_t ndl_length = static_cast(needle_offsets[i] - ndl_from - 1); + std::string_view needle(ndl_data, ndl_length); + + if (needle.empty()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0; + const char * repl_data = reinterpret_cast(replacement_data.data() + repl_from); + const size_t repl_length = static_cast(replacement_offsets[i] - repl_from - 1); + + re2_st::RE2 searcher(needle, regexp_options); + if (!searcher.ok()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error()); + int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures); + Instructions instructions = createInstructions(std::string_view(repl_data, repl_length), num_captures); + + processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions); + res_offsets[i] = res_offset; + } + } + + static void vectorFixedConstantConstant( + const ColumnString::Chars & haystack_data, + size_t n, + const String & needle, + const String & replacement, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + if (needle.empty()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + ColumnString::Offset res_offset = 0; + size_t haystack_size = haystack_data.size() / n; + res_data.reserve(haystack_data.size()); + res_offsets.resize(haystack_size); + + re2_st::RE2::Options regexp_options; + /// Don't write error messages to stderr. 
+ regexp_options.set_log_errors(false); + + re2_st::RE2 searcher(needle, regexp_options); + + if (!searcher.ok()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The pattern argument is not a valid re2 pattern: {}", searcher.error()); int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures); Instructions instructions = createInstructions(replacement, num_captures); - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < haystack_size; ++i) { size_t from = i * n; - const char * haystack_data = reinterpret_cast(data.data() + from); - const size_t haystack_length = n; + const char * hs_data = reinterpret_cast(haystack_data.data() + from); + const size_t hs_length = n; - processString(haystack_data, haystack_length, res_data, res_offset, searcher, num_captures, instructions); + processString(hs_data, hs_length, res_data, res_offset, searcher, num_captures, instructions); res_offsets[i] = res_offset; } } diff --git a/src/Functions/ReplaceStringImpl.h b/src/Functions/ReplaceStringImpl.h index 1a9ec49c58c..186348d7d53 100644 --- a/src/Functions/ReplaceStringImpl.h +++ b/src/Functions/ReplaceStringImpl.h @@ -8,6 +8,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; +} + struct ReplaceStringTraits { enum class Replace @@ -16,27 +21,33 @@ struct ReplaceStringTraits All }; }; -/** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants. + +/** Replace one or all occurrences of substring 'needle' with 'replacement'. 
*/ -template +template struct ReplaceStringImpl { - static void vector( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, - const std::string & needle, - const std::string & replacement, + static constexpr auto name = Name::name; + + static void vectorConstantConstant( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const String & needle, + const String & replacement, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { - const UInt8 * begin = data.data(); + if (needle.empty()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); ColumnString::Offset res_offset = 0; - res_data.reserve(data.size()); - size_t size = offsets.size(); - res_offsets.resize(size); + res_data.reserve(haystack_data.size()); + const size_t haystack_size = haystack_offsets.size(); + res_offsets.resize(haystack_size); /// The current index in the array of strings. size_t i = 0; @@ -53,22 +64,22 @@ struct ReplaceStringImpl memcpy(&res_data[res_offset], pos, match - pos); /// Determine which index it belongs to. - while (i < offsets.size() && begin + offsets[i] <= match) + while (i < haystack_offsets.size() && begin + haystack_offsets[i] <= match) { - res_offsets[i] = res_offset + ((begin + offsets[i]) - pos); + res_offsets[i] = res_offset + ((begin + haystack_offsets[i]) - pos); ++i; } res_offset += (match - pos); /// If you have reached the end, it's time to stop - if (i == offsets.size()) + if (i == haystack_offsets.size()) break; /// Is it true that this string no longer needs to perform transformations. bool can_finish_current_string = false; /// We check that the entry does not go through the boundaries of strings. 
- if (match + needle.size() < begin + offsets[i]) + if (match + needle.size() < begin + haystack_offsets[i]) { res_data.resize(res_data.size() + replacement.size()); memcpy(&res_data[res_offset], replacement.data(), replacement.size()); @@ -85,34 +96,268 @@ struct ReplaceStringImpl if (can_finish_current_string) { - res_data.resize(res_data.size() + (begin + offsets[i] - pos)); - memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos)); - res_offset += (begin + offsets[i] - pos); + res_data.resize(res_data.size() + (begin + haystack_offsets[i] - pos)); + memcpy(&res_data[res_offset], pos, (begin + haystack_offsets[i] - pos)); + res_offset += (begin + haystack_offsets[i] - pos); res_offsets[i] = res_offset; - pos = begin + offsets[i]; + pos = begin + haystack_offsets[i]; ++i; } } } - /// Note: this function converts fixed-length strings to variable-length strings - /// and each variable-length string should ends with zero byte. - static void vectorFixed( - const ColumnString::Chars & data, - size_t n, - const std::string & needle, - const std::string & replacement, + template + requires (sizeof(CharT) == 1) + static void copyToOutput( + const CharT * what_start, size_t what_size, + ColumnString::Chars & output, size_t & output_offset) + { + output.resize(output.size() + what_size); + memcpy(&output[output_offset], what_start, what_size); + output_offset += what_size; + } + + static void vectorVectorConstant( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnString::Chars & needle_data, + const ColumnString::Offsets & needle_offsets, + const String & replacement, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { - const UInt8 * begin = data.data(); - const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); + chassert(haystack_offsets.size() == needle_offsets.size()); + + const size_t haystack_size = haystack_offsets.size(); + + res_data.reserve(haystack_data.size()); + 
res_offsets.resize(haystack_size); ColumnString::Offset res_offset = 0; - size_t count = data.size() / n; - res_data.reserve(data.size()); - res_offsets.resize(count); + + size_t prev_haystack_offset = 0; + size_t prev_needle_offset = 0; + + for (size_t i = 0; i < haystack_size; ++i) + { + const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset]; + const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1; + + const auto * const cur_needle_data = &needle_data[prev_needle_offset]; + const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1; + + if (cur_needle_length == 0) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + /// Using "slow" stdlib searcher instead of Volnitsky because there is a different pattern in each row + StdLibASCIIStringSearcher searcher(cur_needle_data, cur_needle_length); + + const auto * last_match = static_cast(nullptr); + const auto * start_pos = cur_haystack_data; + const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length; + + while (start_pos < cur_haystack_end) + { + if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end) + { + /// Copy prefix before match + copyToOutput(start_pos, match - start_pos, res_data, res_offset); + + /// Insert replacement for match + copyToOutput(replacement.data(), replacement.size(), res_data, res_offset); + + last_match = match; + start_pos = match + cur_needle_length; + + if constexpr (replace == ReplaceStringTraits::Replace::First) + break; + } + else + break; + } + + /// Copy suffix after last match + size_t bytes = (last_match == nullptr) ? 
(cur_haystack_end - cur_haystack_data + 1) + : (cur_haystack_end - last_match - cur_needle_length + 1); + copyToOutput(start_pos, bytes, res_data, res_offset); + + res_offsets[i] = res_offset; + + prev_haystack_offset = haystack_offsets[i]; + prev_needle_offset = needle_offsets[i]; + } + } + + static void vectorConstantVector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const String & needle, + const ColumnString::Chars & replacement_data, + const ColumnString::Offsets & replacement_offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + chassert(haystack_offsets.size() == replacement_offsets.size()); + + if (needle.empty()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + const size_t haystack_size = haystack_offsets.size(); + + res_data.reserve(haystack_data.size()); + res_offsets.resize(haystack_size); + + ColumnString::Offset res_offset = 0; + + size_t prev_haystack_offset = 0; + size_t prev_replacement_offset = 0; + + for (size_t i = 0; i < haystack_size; ++i) + { + const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset]; + const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1; + + const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset]; + const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1; + + /// Using "slow" stdlib searcher instead of Volnitsky just to keep things simple + StdLibASCIIStringSearcher searcher(needle.data(), needle.size()); + + const auto * last_match = static_cast(nullptr); + const auto * start_pos = cur_haystack_data; + const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length; + + while (start_pos < cur_haystack_end) + { + if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end) + { + /// 
Copy prefix before match + copyToOutput(start_pos, match - start_pos, res_data, res_offset); + + /// Insert replacement for match + copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset); + + last_match = match; + start_pos = match + needle.size(); + + if constexpr (replace == ReplaceStringTraits::Replace::First) + break; + } + else + break; + } + + /// Copy suffix after last match + size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1) + : (cur_haystack_end - last_match - needle.size() + 1); + copyToOutput(start_pos, bytes, res_data, res_offset); + + res_offsets[i] = res_offset; + + prev_haystack_offset = haystack_offsets[i]; + prev_replacement_offset = replacement_offsets[i]; + } + } + + static void vectorVectorVector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnString::Chars & needle_data, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars & replacement_data, + const ColumnString::Offsets & replacement_offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + chassert(haystack_offsets.size() == needle_offsets.size()); + chassert(needle_offsets.size() == replacement_offsets.size()); + + const size_t haystack_size = haystack_offsets.size(); + + res_data.reserve(haystack_data.size()); + res_offsets.resize(haystack_size); + + ColumnString::Offset res_offset = 0; + + size_t prev_haystack_offset = 0; + size_t prev_needle_offset = 0; + size_t prev_replacement_offset = 0; + + for (size_t i = 0; i < haystack_size; ++i) + { + const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset]; + const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1; + + const auto * const cur_needle_data = &needle_data[prev_needle_offset]; + const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1; + + const auto * const cur_replacement_data = 
&replacement_data[prev_replacement_offset]; + const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1; + + if (cur_needle_length == 0) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + /// Using "slow" stdlib searcher instead of Volnitsky because there is a different pattern in each row + StdLibASCIIStringSearcher searcher(cur_needle_data, cur_needle_length); + + const auto * last_match = static_cast(nullptr); + const auto * start_pos = cur_haystack_data; + const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length; + + while (start_pos < cur_haystack_end) + { + if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end) + { + /// Copy prefix before match + copyToOutput(start_pos, match - start_pos, res_data, res_offset); + + /// Insert replacement for match + copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset); + + last_match = match; + start_pos = match + cur_needle_length; + + if constexpr (replace == ReplaceStringTraits::Replace::First) + break; + } + else + break; + } + + /// Copy suffix after last match + size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1) + : (cur_haystack_end - last_match - cur_needle_length + 1); + copyToOutput(start_pos, bytes, res_data, res_offset); + + res_offsets[i] = res_offset; + + prev_haystack_offset = haystack_offsets[i]; + prev_needle_offset = needle_offsets[i]; + prev_replacement_offset = replacement_offsets[i]; + } + } + + /// Note: this function converts fixed-length strings to variable-length strings + /// and each variable-length string should end with a zero byte. 
+ static void vectorFixedConstantConstant( + const ColumnString::Chars & haystack_data, + size_t n, + const String & needle, + const String & replacement, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + if (needle.empty()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name); + + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.data() + haystack_data.size(); + const UInt8 * pos = begin; + + ColumnString::Offset res_offset = 0; + size_t haystack_size = haystack_data.size() / n; + res_data.reserve(haystack_data.size()); + res_offsets.resize(haystack_size); /// The current index in the string array. size_t i = 0; @@ -139,13 +384,13 @@ struct ReplaceStringImpl /// Copy skipped strings without any changes but /// add zero byte to the end of each string. - while (i < count && begin + n * (i + 1) <= match) + while (i < haystack_size && begin + n * (i + 1) <= match) { COPY_REST_OF_CURRENT_STRING(); } /// If you have reached the end, it's time to stop - if (i == count) + if (i == haystack_size) break; /// Copy unchanged part of current string. 
diff --git a/src/Functions/replaceAll.cpp b/src/Functions/replaceAll.cpp index d85d192d199..6c06f5984b3 100644 --- a/src/Functions/replaceAll.cpp +++ b/src/Functions/replaceAll.cpp @@ -13,7 +13,7 @@ struct NameReplaceAll static constexpr auto name = "replaceAll"; }; -using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; +using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; } diff --git a/src/Functions/replaceOne.cpp b/src/Functions/replaceOne.cpp index 6557339537e..62be2906a71 100644 --- a/src/Functions/replaceOne.cpp +++ b/src/Functions/replaceOne.cpp @@ -13,7 +13,7 @@ struct NameReplaceOne static constexpr auto name = "replaceOne"; }; -using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; +using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; } diff --git a/src/Functions/replaceRegexpAll.cpp b/src/Functions/replaceRegexpAll.cpp index 4eaf46c05d4..f5f56fb0f35 100644 --- a/src/Functions/replaceRegexpAll.cpp +++ b/src/Functions/replaceRegexpAll.cpp @@ -13,7 +13,7 @@ struct NameReplaceRegexpAll static constexpr auto name = "replaceRegexpAll"; }; -using FunctionReplaceRegexpAll = FunctionStringReplace, NameReplaceRegexpAll>; +using FunctionReplaceRegexpAll = FunctionStringReplace, NameReplaceRegexpAll>; } diff --git a/src/Functions/replaceRegexpOne.cpp b/src/Functions/replaceRegexpOne.cpp index 60e29213a9a..fc3e55aa791 100644 --- a/src/Functions/replaceRegexpOne.cpp +++ b/src/Functions/replaceRegexpOne.cpp @@ -13,7 +13,7 @@ struct NameReplaceRegexpOne static constexpr auto name = "replaceRegexpOne"; }; -using FunctionReplaceRegexpOne = FunctionStringReplace, NameReplaceRegexpOne>; +using FunctionReplaceRegexpOne = FunctionStringReplace, NameReplaceRegexpOne>; } diff --git a/tests/queries/0_stateless/00765_sql_compatibility_aliases.reference b/tests/queries/0_stateless/00765_sql_compatibility_aliases.reference index 6a2a0523476..285b9a62d20 100644 --- 
a/tests/queries/0_stateless/00765_sql_compatibility_aliases.reference +++ b/tests/queries/0_stateless/00765_sql_compatibility_aliases.reference @@ -3,6 +3,7 @@ FOO foo FOO baz +zzz 2 fo oo diff --git a/tests/queries/0_stateless/00765_sql_compatibility_aliases.sql b/tests/queries/0_stateless/00765_sql_compatibility_aliases.sql index 4e16768b373..da0eb9bea6d 100644 --- a/tests/queries/0_stateless/00765_sql_compatibility_aliases.sql +++ b/tests/queries/0_stateless/00765_sql_compatibility_aliases.sql @@ -5,6 +5,7 @@ select ucase('foo'); select LOWER('Foo'); select UPPER('Foo'); select REPLACE('bar', 'r', 'z'); +select REGEXP_REPLACE('bar', '.', 'z'); select Locate('foo', 'o'); select SUBSTRING('foo', 1, 2); select Substr('foo', 2); diff --git a/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.reference b/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.reference new file mode 100644 index 00000000000..c7a02045316 --- /dev/null +++ b/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.reference @@ -0,0 +1,77 @@ +** replaceAll() ** +- non-const needle, const replacement +1 Hello World l x Hexxo Worxd +2 Hello World ll x Hexo World +3 Hello World not_found x Hello World +4 Hello World [eo] x Hello World +5 Hello World . x Hello World +- const needle, non-const replacement +1 Hello World l xx Hexxxxo Worxxd +2 Hello World l x Hexxo Worxd +3 Hello World l x Hexxo Worxd +4 Hello World l x Hexxo Worxd +5 Hello World l x Hexxo Worxd +- non-const needle, non-const replacement +1 Hello World l xx Hexxxxo Worxxd +2 Hello World ll x Hexo World +3 Hello World not_found x Hello World +4 Hello World [eo] x Hello World +5 Hello World . x Hello World +** replaceOne() ** +- non-const needle, const replacement +1 Hello World l x Hexlo World +2 Hello World ll x Hexo World +3 Hello World not_found x Hello World +4 Hello World [eo] x Hello World +5 Hello World . 
x Hello World +- const needle, non-const replacement +1 Hello World l xx Hexxlo World +2 Hello World l x Hexlo World +3 Hello World l x Hexlo World +4 Hello World l x Hexlo World +5 Hello World l x Hexlo World +- non-const needle, non-const replacement +1 Hello World l xx Hexxlo World +2 Hello World ll x Hexo World +3 Hello World not_found x Hello World +4 Hello World [eo] x Hello World +5 Hello World . x Hello World +** replaceRegexpAll() ** +- non-const needle, const replacement +1 Hello World l x Hexxo Worxd +2 Hello World ll x Hexo World +3 Hello World not_found x Hello World +4 Hello World [eo] x Hxllx Wxrld +5 Hello World . x xxxxxxxxxxx +- const needle, non-const replacement +1 Hello World l xx Hexxxxo Worxxd +2 Hello World l x Hexxo Worxd +3 Hello World l x Hexxo Worxd +4 Hello World l x Hexxo Worxd +5 Hello World l x Hexxo Worxd +- non-const needle, non-const replacement +1 Hello World l xx Hexxxxo Worxxd +2 Hello World ll x Hexo World +3 Hello World not_found x Hello World +4 Hello World [eo] x Hxllx Wxrld +5 Hello World . x xxxxxxxxxxx +** replaceRegexpOne() ** +- non-const needle, const replacement +1 Hello World l x Hexlo World +2 Hello World ll x Hexo World +3 Hello World not_found x Hello World +4 Hello World [eo] x Hxllo World +5 Hello World . x xello World +- const needle, non-const replacement +1 Hello World l xx Hexxlo World +2 Hello World l x Hexlo World +3 Hello World l x Hexlo World +4 Hello World l x Hexlo World +5 Hello World l x Hexlo World +- non-const needle, non-const replacement +1 Hello World l xx Hexxlo World +2 Hello World ll x Hexo World +3 Hello World not_found x Hello World +4 Hello World [eo] x Hxllo World +5 Hello World . 
x xello World +Check that an exception is thrown if the needle is empty diff --git a/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.sql b/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.sql new file mode 100644 index 00000000000..7406f0309bb --- /dev/null +++ b/tests/queries/0_stateless/02536_replace_with_nonconst_needle_and_replacement.sql @@ -0,0 +1,90 @@ +-- Tests that functions replaceOne(), replaceAll(), replaceRegexpOne(), replaceRegexpAll() work with non-const pattern and replacement arguments + +DROP TABLE IF EXISTS test_tab; + +CREATE TABLE test_tab + (id UInt32, haystack String, needle String, replacement String) + engine = MergeTree() + ORDER BY id; + +INSERT INTO test_tab VALUES (1, 'Hello World', 'l', 'xx') (2, 'Hello World', 'll', 'x') (3, 'Hello World', 'not_found', 'x') (4, 'Hello World', '[eo]', 'x') (5, 'Hello World', '.', 'x') + +SELECT '** replaceAll() **'; + +SELECT '- non-const needle, const replacement'; +SELECT id, haystack, needle, 'x', replaceAll(haystack, needle, 'x') FROM test_tab ORDER BY id; + +SELECT '- const needle, non-const replacement'; +SELECT id, haystack, 'l', replacement, replaceAll(haystack, 'l', replacement) FROM test_tab ORDER BY id; + +SELECT '- non-const needle, non-const replacement'; +SELECT id, haystack, needle, replacement, replaceAll(haystack, needle, replacement) FROM test_tab ORDER BY id; + +SELECT '** replaceOne() **'; + +SELECT '- non-const needle, const replacement'; +SELECT id, haystack, needle, 'x', replaceOne(haystack, needle, 'x') FROM test_tab ORDER BY id; + + +SELECT '- const needle, non-const replacement'; +SELECT id, haystack, 'l', replacement, replaceOne(haystack, 'l', replacement) FROM test_tab ORDER BY id; + + +SELECT '- non-const needle, non-const replacement'; +SELECT id, haystack, needle, replacement, replaceOne(haystack, needle, replacement) FROM test_tab ORDER BY id; + +SELECT '** replaceRegexpAll() **'; + +SELECT '- non-const 
needle, const replacement'; +SELECT id, haystack, needle, 'x', replaceRegexpAll(haystack, needle, 'x') FROM test_tab ORDER BY id; + +SELECT '- const needle, non-const replacement'; +SELECT id, haystack, 'l', replacement, replaceRegexpAll(haystack, 'l', replacement) FROM test_tab ORDER BY id; + +SELECT '- non-const needle, non-const replacement'; +SELECT id, haystack, needle, replacement, replaceRegexpAll(haystack, needle, replacement) FROM test_tab ORDER BY id; + +SELECT '** replaceRegexpOne() **'; + +SELECT '- non-const needle, const replacement'; +SELECT id, haystack, needle, 'x', replaceRegexpOne(haystack, needle, 'x') FROM test_tab ORDER BY id; + + +SELECT '- const needle, non-const replacement'; +SELECT id, haystack, 'l', replacement, replaceRegexpOne(haystack, 'l', replacement) FROM test_tab ORDER BY id; + + +SELECT '- non-const needle, non-const replacement'; +SELECT id, haystack, needle, replacement, replaceRegexpOne(haystack, needle, replacement) FROM test_tab ORDER BY id; + +DROP TABLE IF EXISTS test_tab; + + +SELECT 'Check that an exception is thrown if the needle is empty'; + +CREATE TABLE test_tab + (id UInt32, haystack String, needle String, replacement String) + engine = MergeTree() + ORDER BY id; + +INSERT INTO test_tab VALUES (1, 'Hello World', 'l', 'x') (2, 'Hello World', '', 'y') + +-- needle: non-const, replacement: const +SELECT replaceAll(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceOne(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } + +-- needle: const, replacement: non-const +SELECT replaceAll(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceOne(haystack, '', replacement) FROM test_tab; -- { serverError 
ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } + +-- needle: non-const, replacement: non-const +SELECT replaceAll(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceOne(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND } + +DROP TABLE IF EXISTS test_tab;