From 9ed7df64cd78ef17773dc7e7e1a14379847359b7 Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 1 Aug 2020 21:14:23 +0000 Subject: [PATCH 1/7] Support 'start_pos' argument in 'position' function --- src/Functions/FunctionsStringSearch.h | 72 +++- src/Functions/FunctionsVisitParam.h | 9 +- src/Functions/HasTokenImpl.h | 11 +- src/Functions/MatchImpl.h | 14 +- src/Functions/PositionImpl.h | 139 ++++++-- .../00233_position_function_family.reference | 313 ++++++++++++++++++ .../00233_position_function_family.sql | 80 +++++ 7 files changed, 600 insertions(+), 38 deletions(-) diff --git a/src/Functions/FunctionsStringSearch.h b/src/Functions/FunctionsStringSearch.h index b3f016a55b1..9dee4ff062c 100644 --- a/src/Functions/FunctionsStringSearch.h +++ b/src/Functions/FunctionsStringSearch.h @@ -11,7 +11,6 @@ #include #include - namespace DB { /** Search and replace functions in strings: @@ -51,15 +50,26 @@ public: String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 2; } + bool isVariadic() const override { + return Impl::supports_start_pos; + } + + size_t getNumberOfArguments() const override { + if (Impl::supports_start_pos) { + return 0; + } + return 2; + } bool useDefaultImplementationForConstants() const override { return Impl::use_default_implementation_for_constants; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { - return Impl::use_default_implementation_for_constants - ? 
ColumnNumbers{1, 2} - : ColumnNumbers{}; + if (!Impl::use_default_implementation_for_constants) + return ColumnNumbers{}; + if (!Impl::supports_start_pos) + return ColumnNumbers{1, 2}; + return ColumnNumbers{1, 2, 3}; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override @@ -72,6 +82,13 @@ public: throw Exception( "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (arguments.size() >= 3) { + if (!isUnsignedInteger(arguments[2])) { + throw Exception( + "Illegal type " + arguments[2]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + } + return std::make_shared>(); } @@ -82,17 +99,36 @@ public: const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column; const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column; + ColumnPtr column_start_pos = nullptr; + if (arguments.size() >= 3) { + column_start_pos = block.getByPosition(arguments[2]).column; + } + const ColumnConst * col_haystack_const = typeid_cast(&*column_haystack); const ColumnConst * col_needle_const = typeid_cast(&*column_needle); if constexpr (!Impl::use_default_implementation_for_constants) { + bool is_col_start_pos_const = column_start_pos == nullptr || isColumnConst(*column_start_pos); if (col_haystack_const && col_needle_const) { - ResultType res{}; - Impl::constantConstant(col_haystack_const->getValue(), col_needle_const->getValue(), res); - block.getByPosition(result).column - = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res)); + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(is_col_start_pos_const ? 
1 : column_start_pos->size()); + + Impl::constantConstant( + col_haystack_const->getValue(), + col_needle_const->getValue(), + column_start_pos, + vec_res); + + if (is_col_start_pos_const) { + block.getByPosition(result).column + = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(vec_res[0])); + } else { + block.getByPosition(result).column = std::move(col_res); + } + return; } } @@ -112,16 +148,28 @@ public: col_haystack_vector->getOffsets(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), + column_start_pos, vec_res); else if (col_haystack_vector && col_needle_const) Impl::vectorConstant( - col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue(), vec_res); + col_haystack_vector->getChars(), + col_haystack_vector->getOffsets(), + col_needle_const->getValue(), + column_start_pos, + vec_res); else if (col_haystack_vector_fixed && col_needle_const) Impl::vectorFixedConstant( - col_haystack_vector_fixed->getChars(), col_haystack_vector_fixed->getN(), col_needle_const->getValue(), vec_res); + col_haystack_vector_fixed->getChars(), + col_haystack_vector_fixed->getN(), + col_needle_const->getValue(), + vec_res); else if (col_haystack_const && col_needle_vector) Impl::constantVector( - col_haystack_const->getValue(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res); + col_haystack_const->getValue(), + col_needle_vector->getChars(), + col_needle_vector->getOffsets(), + column_start_pos, + vec_res); else throw Exception( "Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and " diff --git a/src/Functions/FunctionsVisitParam.h b/src/Functions/FunctionsVisitParam.h index 02e55df6691..528d47a1e1c 100644 --- a/src/Functions/FunctionsVisitParam.h +++ b/src/Functions/FunctionsVisitParam.h @@ -79,12 +79,19 @@ struct ExtractParamImpl using ResultType = typename ParamExtractor::ResultType; static constexpr bool 
use_default_implementation_for_constants = true; + static constexpr bool supports_start_pos = false; /// It is assumed that `res` is the correct size and initialized with zeros. - static void vectorConstant(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, + static void vectorConstant( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, std::string needle, + const ColumnPtr & start_pos, PaddedPODArray & res) { + if (start_pos != nullptr) { + throw Exception("Functions 'visitParamHas' and 'visitParamExtract*' doesn't support start_pos argument", ErrorCodes::ILLEGAL_COLUMN); + } /// We are looking for a parameter simply as a substring of the form "name" needle = "\"" + needle + "\":"; diff --git a/src/Functions/HasTokenImpl.h b/src/Functions/HasTokenImpl.h index b8f250be69f..329f015624b 100644 --- a/src/Functions/HasTokenImpl.h +++ b/src/Functions/HasTokenImpl.h @@ -19,10 +19,19 @@ struct HasTokenImpl using ResultType = UInt8; static constexpr bool use_default_implementation_for_constants = true; + static constexpr bool supports_start_pos = false; static void vectorConstant( - const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray & res) + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + const std::string & pattern, + const ColumnPtr & start_pos, + PaddedPODArray & res) { + if (start_pos != nullptr) { + throw Exception("Function 'hasToken' does not support start_pos argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + if (offsets.empty()) return; diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 70c8419fcc9..ce69ff8a38d 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -76,6 +76,7 @@ template struct MatchImpl { static constexpr bool use_default_implementation_for_constants = true; + static constexpr bool supports_start_pos = false; using ResultType = UInt8; @@ -84,8 +85,16 @@ struct 
MatchImpl VolnitskyUTF8>; static void vectorConstant( - const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray & res) + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + const std::string & pattern, + const ColumnPtr & start_pos, + PaddedPODArray & res) { + if (start_pos != nullptr) { + throw Exception("Functions 'like' and 'match' don't support start_pos argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + if (offsets.empty()) return; @@ -238,7 +247,8 @@ struct MatchImpl /// Very carefully crafted copy-paste. static void vectorFixedConstant( - const ColumnString::Chars & data, size_t n, const std::string & pattern, PaddedPODArray & res) + const ColumnString::Chars & data, size_t n, const std::string & pattern, + PaddedPODArray & res) { if (data.empty()) return; diff --git a/src/Functions/PositionImpl.h b/src/Functions/PositionImpl.h index bc0b2c7bcfb..d865c15fdf3 100644 --- a/src/Functions/PositionImpl.h +++ b/src/Functions/PositionImpl.h @@ -1,11 +1,11 @@ #include "FunctionsStringSearch.h" +#include #include #include #include #include - namespace DB { @@ -42,6 +42,10 @@ struct PositionCaseSensitiveASCII return MultiSearcherInBigHaystack(needles); } + static const char * advancePos(const char * pos, const char * end, size_t n) { + return std::min(pos + n, end); + } + /// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8). 
static size_t countChars(const char * begin, const char * end) { return end - begin; } @@ -73,6 +77,10 @@ struct PositionCaseInsensitiveASCII return MultiSearcherInBigHaystack(needles); } + static const char * advancePos(const char * pos, const char * end, size_t n) { + return std::min(pos + n, end); + } + static size_t countChars(const char * begin, const char * end) { return end - begin; } static void toLowerIfNeed(std::string & s) { std::transform(std::begin(s), std::end(s), std::begin(s), tolower); } @@ -100,6 +108,19 @@ struct PositionCaseSensitiveUTF8 return MultiSearcherInBigHaystack(needles); } + static const char * advancePos(const char * pos, const char * end, size_t n) + { + for (auto it = pos; it != end; ++it) { + if (!UTF8::isContinuationOctet(static_cast(*it))) { + if (n == 0) { + return it; + } + n--; + } + } + return end; + } + static size_t countChars(const char * begin, const char * end) { size_t res = 0; @@ -134,13 +155,16 @@ struct PositionCaseInsensitiveUTF8 return MultiSearcherInBigHaystack(needles); } + static const char * advancePos(const char * pos, const char * end, size_t n) + { + // reuse implementation that doesn't depend on case + return PositionCaseSensitiveUTF8::advancePos(pos, end, n); + } + static size_t countChars(const char * begin, const char * end) { - size_t res = 0; - for (auto it = begin; it != end; ++it) - if (!UTF8::isContinuationOctet(static_cast(*it))) - ++res; - return res; + // reuse implementation that doesn't depend on case + return PositionCaseSensitiveUTF8::countChars(begin, end); } static void toLowerIfNeed(std::string & s) { Poco::UTF8::toLowerInPlace(s); } @@ -151,12 +175,17 @@ template struct PositionImpl { static constexpr bool use_default_implementation_for_constants = false; + static constexpr bool supports_start_pos = true; using ResultType = UInt64; /// Find one substring in many strings. 
static void vectorConstant( - const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray & res) + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + const std::string & needle, + const ColumnPtr & start_pos, + PaddedPODArray & res) { const UInt8 * begin = data.data(); const UInt8 * pos = begin; @@ -176,13 +205,23 @@ struct PositionImpl res[i] = 0; ++i; } + auto start = start_pos != nullptr ? start_pos->getUInt(i) : 0; /// We check that the entry does not pass through the boundaries of strings. - if (pos + needle.size() < begin + offsets[i]) - res[i] = 1 + Impl::countChars(reinterpret_cast(begin + offsets[i - 1]), reinterpret_cast(pos)); - else + if (pos + needle.size() < begin + offsets[i]) { + auto res_pos = 1 + Impl::countChars(reinterpret_cast(begin + offsets[i - 1]), reinterpret_cast(pos)); + if (res_pos < start) { + pos = reinterpret_cast(Impl::advancePos( + reinterpret_cast(pos), + reinterpret_cast(begin + offsets[i]), + start - res_pos)); + continue; + } + res[i] = res_pos; + } + else { res[i] = 0; - + } pos = begin + offsets[i]; ++i; } @@ -192,24 +231,64 @@ struct PositionImpl } /// Search for substring in string. - static void constantConstant(std::string data, std::string needle, UInt64 & res) + static void constantConstantScalar( + std::string data, + std::string needle, + UInt64 start_pos, + UInt64 & res) { - Impl::toLowerIfNeed(data); - Impl::toLowerIfNeed(needle); + auto start = std::max(start_pos, 1ul); - res = data.find(needle); + if (needle.size() == 0) { + size_t haystack_size = Impl::countChars(data.data(), data.data() + data.size()); + res = start <= haystack_size + 1 ? 
start : 0; + return; + } + + size_t start_byte = Impl::advancePos(data.data(), data.data() + data.size(), start - 1) - data.data(); + res = data.find(needle, start_byte); if (res == std::string::npos) res = 0; else res = 1 + Impl::countChars(data.data(), data.data() + res); } + /// Search for substring in string starting from different positions. + static void constantConstant( + std::string data, + std::string needle, + const ColumnPtr & start_pos, + PaddedPODArray & res) + { + Impl::toLowerIfNeed(data); + Impl::toLowerIfNeed(needle); + + if (start_pos == nullptr) { + constantConstantScalar(data, needle, 0, res[0]); + return; + } + + size_t haystack_size = Impl::countChars(data.data(), data.data() + data.size()); + + size_t size = start_pos != nullptr ? start_pos->size() : 0; + for (size_t i = 0; i < size; ++i) { + auto start = start_pos->getUInt(i); + + if (start > haystack_size + 1) { + res[i] = 0; + continue; + } + constantConstantScalar(data, needle, start, res[i]); + } + } + /// Search each time for a different single substring inside each time different string. static void vectorVector( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offsets, + const ColumnPtr & start_pos, PaddedPODArray & res) { ColumnString::Offset prev_haystack_offset = 0; @@ -222,10 +301,15 @@ struct PositionImpl size_t needle_size = needle_offsets[i] - prev_needle_offset - 1; size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1; - if (0 == needle_size) + auto start = start_pos != nullptr ? std::max(start_pos->getUInt(i), 1ul) : 1ul; + + if (start > haystack_size + 1) { + res[i] = 0; + } + else if (0 == needle_size) { - /// An empty string is always at the very beginning of `haystack`. - res[i] = 1; + /// An empty string is always at any position in `haystack`. 
+ res[i] = start; } else { @@ -234,8 +318,12 @@ struct PositionImpl reinterpret_cast(&needle_data[prev_needle_offset]), needle_offsets[i] - prev_needle_offset - 1); /// zero byte at the end + const char * beg = Impl::advancePos( + reinterpret_cast(&haystack_data[prev_haystack_offset]), + reinterpret_cast(&haystack_data[haystack_offsets[i] - 1]), + start - 1); /// searcher returns a pointer to the found substring or to the end of `haystack`. - size_t pos = searcher.search(&haystack_data[prev_haystack_offset], &haystack_data[haystack_offsets[i] - 1]) + size_t pos = searcher.search(reinterpret_cast(beg), &haystack_data[haystack_offsets[i] - 1]) - &haystack_data[prev_haystack_offset]; if (pos != haystack_size) @@ -259,6 +347,7 @@ struct PositionImpl const String & haystack, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offsets, + const ColumnPtr & start_pos, PaddedPODArray & res) { // NOTE You could use haystack indexing. But this is a rare case. @@ -271,17 +360,23 @@ struct PositionImpl { size_t needle_size = needle_offsets[i] - prev_needle_offset - 1; - if (0 == needle_size) + auto start = start_pos != nullptr ? 
std::max(start_pos->getUInt(i), 1ul) : 1ul; + + if (start > haystack.size() + 1) { + res[i] = 0; + } + else if (0 == needle_size) { - res[i] = 1; + res[i] = start; } else { typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack( reinterpret_cast(&needle_data[prev_needle_offset]), needle_offsets[i] - prev_needle_offset - 1); + const char * beg = Impl::advancePos(haystack.data(), haystack.data() + haystack.size(), start - 1); size_t pos = searcher.search( - reinterpret_cast(haystack.data()), + reinterpret_cast(beg), reinterpret_cast(haystack.data()) + haystack.size()) - reinterpret_cast(haystack.data()); diff --git a/tests/queries/0_stateless/00233_position_function_family.reference b/tests/queries/0_stateless/00233_position_function_family.reference index 85cc0a8ff2b..e1b22676c5e 100644 --- a/tests/queries/0_stateless/00233_position_function_family.reference +++ b/tests/queries/0_stateless/00233_position_function_family.reference @@ -23441,3 +23441,316 @@ 1 1 1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git 
a/tests/queries/0_stateless/00233_position_function_family.sql b/tests/queries/0_stateless/00233_position_function_family.sql index 1bfd17310e1..94f25e16e83 100644 --- a/tests/queries/0_stateless/00233_position_function_family.sql +++ b/tests/queries/0_stateless/00233_position_function_family.sql @@ -6,6 +6,23 @@ select 1 = position('abc', 'abc'); select 2 = position('abc', 'bc'); select 3 = position('abc', 'c'); +select 1 = position('', '', 0); +select 1 = position('', '', 1); +select 0 = position('', '', 2); +select 1 = position('a', '', 1); +select 2 = position('a', '', 2); +select 0 = position('a', '', 3); + +select [1, 1, 2, 3, 4, 5, 0, 0, 0, 0] = groupArray(position('aaaa', '', number)) from numbers(10); +select [1, 1, 2, 3, 4, 5, 0, 0, 0, 0] = groupArray(position(materialize('aaaa'), '', number)) from numbers(10); +select [1, 1, 2, 3, 4, 5, 0, 0, 0, 0] = groupArray(position('aaaa', materialize(''), number)) from numbers(10); +select [1, 1, 2, 3, 4, 5, 0, 0, 0, 0] = groupArray(position(materialize('aaaa'), materialize(''), number)) from numbers(10); + +select [1, 1, 2, 3, 4, 0, 0, 0, 0, 0] = groupArray(position('aaaa', 'a', number)) from numbers(10); +select [1, 1, 2, 3, 4, 0, 0, 0, 0, 0] = groupArray(position(materialize('aaaa'), 'a', number)) from numbers(10); +select [1, 1, 2, 3, 4, 0, 0, 0, 0, 0] = groupArray(position('aaaa', materialize('a'), number)) from numbers(10); +select [1, 1, 2, 3, 4, 0, 0, 0, 0, 0] = groupArray(position(materialize('aaaa'), materialize('a'), number)) from numbers(10); + select 1 = position(materialize(''), ''); select 1 = position(materialize('abc'), ''); select 0 = position(materialize(''), 'abc'); @@ -27,6 +44,16 @@ select 1 = position('абв', 'абв'); select 3 = position('абв', 'бв'); select 5 = position('абв', 'в'); +select 2 = position('abcabc', 'b', 0); +select 2 = position('abcabc', 'b', 1); +select 2 = position('abcabc', 'b', 2); +select 5 = position('abcabc', 'b', 3); +select 5 = position('abcabc', 'b', 4); +select 5 = 
position('abcabc', 'b', 5); +select 0 = position('abcabc', 'b', 6); +select 2 = position('abcabc', 'bca', 0); +select 0 = position('abcabc', 'bca', 3); + select 1 = position(materialize(''), ''); select 1 = position(materialize('абв'), ''); select 0 = position(materialize(''), 'абв'); @@ -48,6 +75,14 @@ select 1 = positionUTF8('абв', 'абв'); select 2 = positionUTF8('абв', 'бв'); select 3 = positionUTF8('абв', 'в'); +select 3 = position('абвабв', 'б', 2); +select 3 = position('абвабв', 'б', 3); +select 3 = position('абвабв', 'бва', 2); +select 9 = position('абвабв', 'б', 4); +select 0 = position('абвабв', 'бва', 4); +select 5 = position('абвабв', 'в', 0); +select 11 = position('абвабв', 'в', 6); + select 1 = positionUTF8(materialize(''), ''); select 1 = positionUTF8(materialize('абв'), ''); select 0 = positionUTF8(materialize(''), 'абв'); @@ -62,6 +97,51 @@ select 1 = positionUTF8(materialize('абв'), 'абв') from system.numbers lim select 2 = positionUTF8(materialize('абв'), 'бв') from system.numbers limit 10; select 3 = positionUTF8(materialize('абв'), 'в') from system.numbers limit 10; +select 2 = positionUTF8('абвабв', 'б', 0); +select 2 = positionUTF8('абвабв', 'б', 1); +select 2 = positionUTF8('абвабв', 'б', 2); +select 5 = positionUTF8('абвабв', 'б', 3); +select 5 = positionUTF8('абвабв', 'б', 4); +select 5 = positionUTF8('абвабв', 'б', 5); +select 0 = positionUTF8('абвабв', 'б', 6); +select 2 = positionUTF8('абвабв', 'бва', 0); +select 0 = positionUTF8('абвабв', 'бва', 3); + +select 2 = positionUTF8(materialize('абвабв'), 'б', 0) from system.numbers limit 10; +select 2 = positionUTF8(materialize('абвабв'), 'б', 1) from system.numbers limit 10; +select 2 = positionUTF8(materialize('абвабв'), 'б', 2) from system.numbers limit 10; +select 5 = positionUTF8(materialize('абвабв'), 'б', 3) from system.numbers limit 10; +select 5 = positionUTF8(materialize('абвабв'), 'б', 4) from system.numbers limit 10; +select 5 = positionUTF8(materialize('абвабв'), 'б', 5) from 
system.numbers limit 10; +select 0 = positionUTF8(materialize('абвабв'), 'б', 6) from system.numbers limit 10; +select 2 = positionUTF8(materialize('абвабв'), 'бва', 0) from system.numbers limit 10; +select 0 = positionUTF8(materialize('абвабв'), 'бва', 3) from system.numbers limit 10; + +select 2 = positionUTF8('абвабв', materialize('б'), 0) from system.numbers limit 10; +select 2 = positionUTF8('абвабв', materialize('б'), 1) from system.numbers limit 10; +select 2 = positionUTF8('абвабв', materialize('б'), 2) from system.numbers limit 10; +select 5 = positionUTF8('абвабв', materialize('б'), 3) from system.numbers limit 10; +select 5 = positionUTF8('абвабв', materialize('б'), 4) from system.numbers limit 10; +select 5 = positionUTF8('абвабв', materialize('б'), 5) from system.numbers limit 10; +select 0 = positionUTF8('абвабв', materialize('б'), 6) from system.numbers limit 10; +select 2 = positionUTF8('абвабв', materialize('бва'), 0) from system.numbers limit 10; +select 0 = positionUTF8('абвабв', materialize('бва'), 3) from system.numbers limit 10; + +select 2 = positionUTF8(materialize('абвабв'), materialize('б'), 0) from system.numbers limit 10; +select 2 = positionUTF8(materialize('абвабв'), materialize('б'), 1) from system.numbers limit 10; +select 2 = positionUTF8(materialize('абвабв'), materialize('б'), 2) from system.numbers limit 10; +select 5 = positionUTF8(materialize('абвабв'), materialize('б'), 3) from system.numbers limit 10; +select 5 = positionUTF8(materialize('абвабв'), materialize('б'), 4) from system.numbers limit 10; +select 5 = positionUTF8(materialize('абвабв'), materialize('б'), 5) from system.numbers limit 10; +select 0 = positionUTF8(materialize('абвабв'), materialize('б'), 6) from system.numbers limit 10; +select 2 = positionUTF8(materialize('абвабв'), materialize('бва'), 0) from system.numbers limit 10; +select 0 = positionUTF8(materialize('абвабв'), materialize('бва'), 3) from system.numbers limit 10; + +select [2, 2, 2, 5, 5, 5, 0, 0, 
0, 0] = groupArray(positionUTF8(materialize('абвабв'), materialize('б'), number)) from numbers(10); +select [2, 2, 2, 5, 5, 5, 0, 0, 0, 0] = groupArray(positionUTF8('абвабв', materialize('б'), number)) from numbers(10); +select [2, 2, 2, 5, 5, 5, 0, 0, 0, 0] = groupArray(positionUTF8('абвабв', 'б', number)) from numbers(10); +select [2, 2, 2, 5, 5, 5, 0, 0, 0, 0] = groupArray(positionUTF8(materialize('абвабв'), 'б', number)) from numbers(10); + select 1 = positionCaseInsensitive('', ''); select 1 = positionCaseInsensitive('abc', ''); select 0 = positionCaseInsensitive('', 'aBc'); From 32f26bcde77b17f595b7e59785b31345abd24e2a Mon Sep 17 00:00:00 2001 From: vdimir Date: Sun, 2 Aug 2020 13:29:10 +0000 Subject: [PATCH 2/7] Add start_pos argument for position to documentation, case insensitive tests --- .../functions/string-search-functions.md | 26 +++++++++--- .../functions/string-search-functions.md | 14 ++++--- .../functions/string-search-functions.md | 14 ++++--- .../functions/string-search-functions.md | 14 ++++--- .../functions/string-search-functions.md | 14 ++++--- .../functions/string-search-functions.md | 14 ++++--- .../00233_position_function_family.reference | 42 +++++++++++++++++++ .../00233_position_function_family.sql | 8 ++++ 8 files changed, 116 insertions(+), 30 deletions(-) diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 067644c30b2..a625af14505 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -21,15 +21,16 @@ For a case-insensitive search, use the function [positionCaseInsensitive](#posit **Syntax** ``` sql -position(haystack, needle) +position(haystack, needle[, start_pos]) ``` -Alias: `locate(haystack, needle)`. +Alias: `locate(haystack, needle[, start_pos])`. **Parameters** - `haystack` — string, in which substring will to be searched. 
[String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Returned values** @@ -56,6 +57,18 @@ Result: └────────────────────────────────┘ ``` +``` sql +SELECT + position('Hello, world!', 'o', 1), + position('Hello, world!', 'o', 7) +``` + +``` text +┌─position('Hello, world!', 'o', 1)─┬─position('Hello, world!', 'o', 7)─┐ +│ 5 │ 9 │ +└───────────────────────────────────┴───────────────────────────────────┘ +``` + The same phrase in Russian contains characters which can’t be represented using a single byte. The function returns some unexpected result (use [positionUTF8](#positionutf8) function for multi-byte encoded text): Query: @@ -81,13 +94,14 @@ Works under the assumption that the string contains a set of bytes representing **Syntax** ``` sql -positionCaseInsensitive(haystack, needle) +positionCaseInsensitive(haystack, needle[, start_pos]) ``` **Parameters** - `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Returned values** @@ -123,13 +137,14 @@ For a case-insensitive search, use the function [positionCaseInsensitiveUTF8](#p **Syntax** ``` sql -positionUTF8(haystack, needle) +positionUTF8(haystack, needle[, start_pos]) ``` **Parameters** - `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). 
+- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Returned values** @@ -195,13 +210,14 @@ Works under the assumption that the string contains a set of bytes representing **Syntax** ``` sql -positionCaseInsensitiveUTF8(haystack, needle) +positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` **Parameters** - `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Returned value** diff --git a/docs/es/sql-reference/functions/string-search-functions.md b/docs/es/sql-reference/functions/string-search-functions.md index 3236745b22c..c448872a186 100644 --- a/docs/es/sql-reference/functions/string-search-functions.md +++ b/docs/es/sql-reference/functions/string-search-functions.md @@ -20,15 +20,16 @@ Para una búsqueda sin distinción de mayúsculas y minúsculas, utilice la func **Sintaxis** ``` sql -position(haystack, needle) +position(haystack, needle[, start_pos]) ``` -Apodo: `locate(haystack, needle)`. +Apodo: `locate(haystack, needle[, start_pos])`. **Parámetros** - `haystack` — string, in which substring will to be searched. [Cadena](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [Cadena](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. 
[UInt](../../sql-reference/data-types/int-uint.md) **Valores devueltos** @@ -80,13 +81,14 @@ Funciona bajo el supuesto de que la cadena contiene un conjunto de bytes que rep **Sintaxis** ``` sql -positionCaseInsensitive(haystack, needle) +positionCaseInsensitive(haystack, needle[, start_pos]) ``` **Parámetros** - `haystack` — string, in which substring will to be searched. [Cadena](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [Cadena](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Valores devueltos** @@ -122,13 +124,14 @@ Para una búsqueda sin distinción de mayúsculas y minúsculas, utilice la func **Sintaxis** ``` sql -positionUTF8(haystack, needle) +positionUTF8(haystack, needle[, start_pos]) ``` **Parámetros** - `haystack` — string, in which substring will to be searched. [Cadena](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [Cadena](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Valores devueltos** @@ -194,13 +197,14 @@ Funciona bajo el supuesto de que la cadena contiene un conjunto de bytes que rep **Sintaxis** ``` sql -positionCaseInsensitiveUTF8(haystack, needle) +positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` **Parámetros** - `haystack` — string, in which substring will to be searched. [Cadena](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [Cadena](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. 
[UInt](../../sql-reference/data-types/int-uint.md) **Valor devuelto** diff --git a/docs/fa/sql-reference/functions/string-search-functions.md b/docs/fa/sql-reference/functions/string-search-functions.md index af68dee0afa..cce6f8f5a4e 100644 --- a/docs/fa/sql-reference/functions/string-search-functions.md +++ b/docs/fa/sql-reference/functions/string-search-functions.md @@ -21,15 +21,16 @@ toc_title: "\u0628\u0631\u0627\u06CC \u062C\u0633\u062A\u062C\u0648\u06CC \u0631 **نحو** ``` sql -position(haystack, needle) +position(haystack, needle[, start_pos]) ``` -نام مستعار: `locate(haystack, needle)`. +نام مستعار: `locate(haystack, needle[, start_pos])`. **پارامترها** - `haystack` — string, in which substring will to be searched. [رشته](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [رشته](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **مقادیر بازگشتی** @@ -81,13 +82,14 @@ SELECT position('Привет, мир!', '!') **نحو** ``` sql -positionCaseInsensitive(haystack, needle) +positionCaseInsensitive(haystack, needle[, start_pos]) ``` **پارامترها** - `haystack` — string, in which substring will to be searched. [رشته](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [رشته](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **مقادیر بازگشتی** @@ -123,13 +125,14 @@ SELECT positionCaseInsensitive('Hello, world!', 'hello') **نحو** ``` sql -positionUTF8(haystack, needle) +positionUTF8(haystack, needle[, start_pos]) ``` **پارامترها** - `haystack` — string, in which substring will to be searched. [رشته](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [رشته](../syntax.md#syntax-string-literal). 
+- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **مقادیر بازگشتی** @@ -195,13 +198,14 @@ SELECT positionUTF8('Salut, étudiante!', '!') **نحو** ``` sql -positionCaseInsensitiveUTF8(haystack, needle) +positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` **پارامترها** - `haystack` — string, in which substring will to be searched. [رشته](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [رشته](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **مقدار بازگشتی** diff --git a/docs/ja/sql-reference/functions/string-search-functions.md b/docs/ja/sql-reference/functions/string-search-functions.md index 00f68c061dd..e5858ba4941 100644 --- a/docs/ja/sql-reference/functions/string-search-functions.md +++ b/docs/ja/sql-reference/functions/string-search-functions.md @@ -20,15 +20,16 @@ toc_title: "\u6587\u5B57\u5217\u3092\u691C\u7D22\u3059\u308B\u5834\u5408" **構文** ``` sql -position(haystack, needle) +position(haystack, needle[, start_pos]) ``` -別名: `locate(haystack, needle)`. +別名: `locate(haystack, needle[, start_pos])`. **パラメータ** - `haystack` — string, in which substring will to be searched. [文字列](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [文字列](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **戻り値** @@ -80,13 +81,14 @@ SELECT position('Привет, мир!', '!') **構文** ``` sql -positionCaseInsensitive(haystack, needle) +positionCaseInsensitive(haystack, needle[, start_pos]) ``` **パラメータ** - `haystack` — string, in which substring will to be searched. [文字列](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. 
[文字列](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **戻り値** @@ -122,13 +124,14 @@ SELECT positionCaseInsensitive('Hello, world!', 'hello') **構文** ``` sql -positionUTF8(haystack, needle) +positionUTF8(haystack, needle[, start_pos]) ``` **パラメータ** - `haystack` — string, in which substring will to be searched. [文字列](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [文字列](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **戻り値** @@ -194,13 +197,14 @@ SELECT positionUTF8('Salut, étudiante!', '!') **構文** ``` sql -positionCaseInsensitiveUTF8(haystack, needle) +positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` **パラメータ** - `haystack` — string, in which substring will to be searched. [文字列](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [文字列](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **戻り値** diff --git a/docs/ru/sql-reference/functions/string-search-functions.md b/docs/ru/sql-reference/functions/string-search-functions.md index b363211d6d0..de713031046 100644 --- a/docs/ru/sql-reference/functions/string-search-functions.md +++ b/docs/ru/sql-reference/functions/string-search-functions.md @@ -15,15 +15,16 @@ **Синтаксис** ``` sql -position(haystack, needle) +position(haystack, needle[, start_pos]) ``` -Алиас: `locate(haystack, needle)`. +Алиас: `locate(haystack, needle[, start_pos])`. **Параметры** - `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). - `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). 
+- `start_pos` – Опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md) **Возвращаемые значения** @@ -75,13 +76,14 @@ SELECT position('Привет, мир!', '!') **Синтаксис** ``` sql -positionCaseInsensitive(haystack, needle) +positionCaseInsensitive(haystack, needle[, start_pos]) ``` **Параметры** - `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). - `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). +- `start_pos` – Опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md) **Возвращаемые значения** @@ -117,13 +119,14 @@ SELECT positionCaseInsensitive('Hello, world!', 'hello') **Синтаксис** ``` sql -positionUTF8(haystack, needle) +positionUTF8(haystack, needle[, start_pos]) ``` **Параметры** - `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). - `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). +- `start_pos` – Опциональный параметр, позиция символа в строке, с которого начинается поиск. [UInt](../../sql-reference/data-types/int-uint.md) **Возвращаемые значения** @@ -189,13 +192,14 @@ SELECT positionUTF8('Salut, étudiante!', '!') **Синтаксис** ``` sql -positionCaseInsensitiveUTF8(haystack, needle) +positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` **Параметры** - `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). - `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). +- `start_pos` – Опциональный параметр, позиция символа в строке, с которого начинается поиск. 
[UInt](../../sql-reference/data-types/int-uint.md) **Возвращаемые значения** diff --git a/docs/tr/sql-reference/functions/string-search-functions.md b/docs/tr/sql-reference/functions/string-search-functions.md index be30510ef2a..b80df910972 100644 --- a/docs/tr/sql-reference/functions/string-search-functions.md +++ b/docs/tr/sql-reference/functions/string-search-functions.md @@ -20,15 +20,16 @@ Büyük / küçük harf duyarsız arama için işlevi kullanın [positionCaseİn **Sözdizimi** ``` sql -position(haystack, needle) +position(haystack, needle[, start_pos]) ``` -Takma ad: `locate(haystack, needle)`. +Takma ad: `locate(haystack, needle[, start_pos])`. **Parametre** - `haystack` — string, in which substring will to be searched. [Dize](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [Dize](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Döndürülen değerler** @@ -80,13 +81,14 @@ Dize, tek baytlık kodlanmış bir metni temsil eden bir bayt kümesi içerdiği **Sözdizimi** ``` sql -positionCaseInsensitive(haystack, needle) +positionCaseInsensitive(haystack, needle[, start_pos]) ``` **Parametre** - `haystack` — string, in which substring will to be searched. [Dize](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [Dize](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Döndürülen değerler** @@ -122,13 +124,14 @@ Büyük / küçük harf duyarsız arama için işlevi kullanın [positionCaseİn **Sözdizimi** ``` sql -positionUTF8(haystack, needle) +positionUTF8(haystack, needle[, start_pos]) ``` **Parametre** - `haystack` — string, in which substring will to be searched. [Dize](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. 
[Dize](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Döndürülen değerler** @@ -194,13 +197,14 @@ Dizenin UTF-8 kodlanmış bir metni temsil eden bir bayt kümesi içerdiği vars **Sözdizimi** ``` sql -positionCaseInsensitiveUTF8(haystack, needle) +positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` **Parametre** - `haystack` — string, in which substring will to be searched. [Dize](../syntax.md#syntax-string-literal). - `needle` — substring to be searched. [Dize](../syntax.md#syntax-string-literal). +- `start_pos` – Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md) **Döndürülen değer** diff --git a/tests/queries/0_stateless/00233_position_function_family.reference b/tests/queries/0_stateless/00233_position_function_family.reference index e1b22676c5e..1523094261f 100644 --- a/tests/queries/0_stateless/00233_position_function_family.reference +++ b/tests/queries/0_stateless/00233_position_function_family.reference @@ -23754,3 +23754,45 @@ 1 1 1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/00233_position_function_family.sql b/tests/queries/0_stateless/00233_position_function_family.sql index 94f25e16e83..8e443f5cd10 100644 --- a/tests/queries/0_stateless/00233_position_function_family.sql +++ b/tests/queries/0_stateless/00233_position_function_family.sql @@ -163,6 +163,10 @@ select 1 = positionCaseInsensitive(materialize('abc'), 'aBc') from system.number select 2 = positionCaseInsensitive(materialize('abc'), 'Bc') from system.numbers limit 10; select 3 = positionCaseInsensitive(materialize('abc'), 'C') from system.numbers limit 10; +select 6 = positionCaseInsensitive(materialize('abcabc'), 'C', 4); +select 6 = 
positionCaseInsensitive(materialize('abcabc'), 'C', 4) from system.numbers limit 10; +select 6 = positionCaseInsensitive(materialize('abcabc'), 'C', materialize(4)) from system.numbers limit 10; + select 1 = positionCaseInsensitive('', ''); select 1 = positionCaseInsensitive('абв', ''); select 0 = positionCaseInsensitive('', 'аБв'); @@ -205,6 +209,10 @@ select 1 = positionCaseInsensitiveUTF8(materialize('абв'), 'аБв') from sys select 2 = positionCaseInsensitiveUTF8(materialize('абв'), 'Бв') from system.numbers limit 10; select 3 = positionCaseInsensitiveUTF8(materialize('абв'), 'В') from system.numbers limit 10; +select 6 = positionCaseInsensitiveUTF8(materialize('абвабв'), 'В', 4); +select 6 = positionCaseInsensitiveUTF8(materialize('абвабв'), 'В', 4) from system.numbers limit 10; +select 6 = positionCaseInsensitiveUTF8(materialize('абвабв'), 'В', materialize(4)) from system.numbers limit 10; + select position('' as h, '' as n) = positionCaseInsensitive(h, n); select position('abc' as h, '' as n) = positionCaseInsensitive(n, n); select 0 = positionCaseInsensitive('', 'aBc'); From 368314b9307ad54a5667e9dd4cf95cb633495af8 Mon Sep 17 00:00:00 2001 From: vdimir Date: Sun, 2 Aug 2020 14:24:39 +0000 Subject: [PATCH 3/7] Fix style in PositionImpl and FunctionsStringSearch --- src/Functions/FunctionsStringSearch.h | 24 +++++++-------- src/Functions/FunctionsVisitParam.h | 4 +-- src/Functions/HasTokenImpl.h | 4 +-- src/Functions/MatchImpl.h | 4 +-- src/Functions/PositionImpl.h | 42 +++++++++++++++++---------- 5 files changed, 43 insertions(+), 35 deletions(-) diff --git a/src/Functions/FunctionsStringSearch.h b/src/Functions/FunctionsStringSearch.h index 9dee4ff062c..39202507d1a 100644 --- a/src/Functions/FunctionsStringSearch.h +++ b/src/Functions/FunctionsStringSearch.h @@ -50,14 +50,12 @@ public: String getName() const override { return name; } - bool isVariadic() const override { - return Impl::supports_start_pos; - } + bool isVariadic() const override { return 
Impl::supports_start_pos; } - size_t getNumberOfArguments() const override { - if (Impl::supports_start_pos) { + size_t getNumberOfArguments() const override + { + if (Impl::supports_start_pos) return 0; - } return 2; } @@ -82,11 +80,11 @@ public: throw Exception( "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - if (arguments.size() >= 3) { - if (!isUnsignedInteger(arguments[2])) { + if (arguments.size() >= 3) + { + if (!isUnsignedInteger(arguments[2])) throw Exception( "Illegal type " + arguments[2]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - } } return std::make_shared>(); @@ -100,9 +98,8 @@ public: const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column; ColumnPtr column_start_pos = nullptr; - if (arguments.size() >= 3) { + if (arguments.size() >= 3) column_start_pos = block.getByPosition(arguments[2]).column; - } const ColumnConst * col_haystack_const = typeid_cast(&*column_haystack); const ColumnConst * col_needle_const = typeid_cast(&*column_needle); @@ -122,12 +119,11 @@ public: column_start_pos, vec_res); - if (is_col_start_pos_const) { + if (is_col_start_pos_const) block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(vec_res[0])); - } else { + else block.getByPosition(result).column = std::move(col_res); - } return; } diff --git a/src/Functions/FunctionsVisitParam.h b/src/Functions/FunctionsVisitParam.h index 528d47a1e1c..047465354c0 100644 --- a/src/Functions/FunctionsVisitParam.h +++ b/src/Functions/FunctionsVisitParam.h @@ -89,9 +89,9 @@ struct ExtractParamImpl const ColumnPtr & start_pos, PaddedPODArray & res) { - if (start_pos != nullptr) { + if (start_pos != nullptr) throw Exception("Functions 'visitParamHas' and 'visitParamExtract*' doesn't support start_pos argument", ErrorCodes::ILLEGAL_COLUMN); - } + /// We are looking for a 
parameter simply as a substring of the form "name" needle = "\"" + needle + "\":"; diff --git a/src/Functions/HasTokenImpl.h b/src/Functions/HasTokenImpl.h index 329f015624b..11256b40933 100644 --- a/src/Functions/HasTokenImpl.h +++ b/src/Functions/HasTokenImpl.h @@ -8,6 +8,7 @@ namespace DB namespace ErrorCodes { + extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; } @@ -28,9 +29,8 @@ struct HasTokenImpl const ColumnPtr & start_pos, PaddedPODArray & res) { - if (start_pos != nullptr) { + if (start_pos != nullptr) throw Exception("Function 'hasToken' does not support start_pos argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - } if (offsets.empty()) return; diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index ce69ff8a38d..ce653cd342c 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -24,6 +24,7 @@ namespace DB namespace ErrorCodes { + extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; } @@ -91,9 +92,8 @@ struct MatchImpl const ColumnPtr & start_pos, PaddedPODArray & res) { - if (start_pos != nullptr) { + if (start_pos != nullptr) throw Exception("Functions 'like' and 'match' don't support start_pos argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - } if (offsets.empty()) return; diff --git a/src/Functions/PositionImpl.h b/src/Functions/PositionImpl.h index d865c15fdf3..922665517dc 100644 --- a/src/Functions/PositionImpl.h +++ b/src/Functions/PositionImpl.h @@ -42,7 +42,8 @@ struct PositionCaseSensitiveASCII return MultiSearcherInBigHaystack(needles); } - static const char * advancePos(const char * pos, const char * end, size_t n) { + static const char * advancePos(const char * pos, const char * end, size_t n) + { return std::min(pos + n, end); } @@ -77,7 +78,8 @@ struct PositionCaseInsensitiveASCII return MultiSearcherInBigHaystack(needles); } - static const char * advancePos(const char * pos, const char * end, size_t n) { + static const char * advancePos(const char * 
pos, const char * end, size_t n) + { return std::min(pos + n, end); } @@ -110,11 +112,12 @@ struct PositionCaseSensitiveUTF8 static const char * advancePos(const char * pos, const char * end, size_t n) { - for (auto it = pos; it != end; ++it) { - if (!UTF8::isContinuationOctet(static_cast(*it))) { - if (n == 0) { + for (auto it = pos; it != end; ++it) + { + if (!UTF8::isContinuationOctet(static_cast(*it))) + { + if (n == 0) return it; - } n--; } } @@ -208,9 +211,11 @@ struct PositionImpl auto start = start_pos != nullptr ? start_pos->getUInt(i) : 0; /// We check that the entry does not pass through the boundaries of strings. - if (pos + needle.size() < begin + offsets[i]) { + if (pos + needle.size() < begin + offsets[i]) + { auto res_pos = 1 + Impl::countChars(reinterpret_cast(begin + offsets[i - 1]), reinterpret_cast(pos)); - if (res_pos < start) { + if (res_pos < start) + { pos = reinterpret_cast(Impl::advancePos( reinterpret_cast(pos), reinterpret_cast(begin + offsets[i]), @@ -219,7 +224,8 @@ struct PositionImpl } res[i] = res_pos; } - else { + else + { res[i] = 0; } pos = begin + offsets[i]; @@ -239,7 +245,8 @@ struct PositionImpl { auto start = std::max(start_pos, 1ul); - if (needle.size() == 0) { + if (needle.size() == 0) + { size_t haystack_size = Impl::countChars(data.data(), data.data() + data.size()); res = start <= haystack_size + 1 ? start : 0; return; @@ -263,7 +270,8 @@ struct PositionImpl Impl::toLowerIfNeed(data); Impl::toLowerIfNeed(needle); - if (start_pos == nullptr) { + if (start_pos == nullptr) + { constantConstantScalar(data, needle, 0, res[0]); return; } @@ -271,10 +279,12 @@ struct PositionImpl size_t haystack_size = Impl::countChars(data.data(), data.data() + data.size()); size_t size = start_pos != nullptr ? 
start_pos->size() : 0; - for (size_t i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) + { auto start = start_pos->getUInt(i); - if (start > haystack_size + 1) { + if (start > haystack_size + 1) + { res[i] = 0; continue; } @@ -303,7 +313,8 @@ struct PositionImpl auto start = start_pos != nullptr ? std::max(start_pos->getUInt(i), 1ul) : 1ul; - if (start > haystack_size + 1) { + if (start > haystack_size + 1) + { res[i] = 0; } else if (0 == needle_size) @@ -362,7 +373,8 @@ struct PositionImpl auto start = start_pos != nullptr ? std::max(start_pos->getUInt(i), 1ul) : 1ul; - if (start > haystack.size() + 1) { + if (start > haystack.size() + 1) + { res[i] = 0; } else if (0 == needle_size) From 7260009978e7bde7cab5fb2b4d5ca710c5cf4b85 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 4 Aug 2020 10:05:16 +0300 Subject: [PATCH 4/7] Fix argument handling in string search functions --- src/Functions/FunctionsStringSearch.h | 9 ++++++++- src/Functions/FunctionsVisitParam.h | 3 ++- src/Functions/HasTokenImpl.h | 2 +- src/Functions/MatchImpl.h | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/Functions/FunctionsStringSearch.h b/src/Functions/FunctionsStringSearch.h index 39202507d1a..b890c9e428d 100644 --- a/src/Functions/FunctionsStringSearch.h +++ b/src/Functions/FunctionsStringSearch.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB { @@ -37,8 +38,9 @@ namespace DB namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } template @@ -72,6 +74,11 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { + if (arguments.size() < 2 || 3 < arguments.size()) + throw Exception("Number of arguments for function " + String(Name::name) + " doesn't match: passed " + + toString(arguments.size()) + ", should be 2 or 3.", + 
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + if (!isStringOrFixedString(arguments[0])) throw Exception( "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); diff --git a/src/Functions/FunctionsVisitParam.h b/src/Functions/FunctionsVisitParam.h index 047465354c0..8ba7d423899 100644 --- a/src/Functions/FunctionsVisitParam.h +++ b/src/Functions/FunctionsVisitParam.h @@ -36,6 +36,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -90,7 +91,7 @@ struct ExtractParamImpl PaddedPODArray & res) { if (start_pos != nullptr) - throw Exception("Functions 'visitParamHas' and 'visitParamExtract*' doesn't support start_pos argument", ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Functions 'visitParamHas' and 'visitParamExtract*' doesn't support start_pos argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); /// We are looking for a parameter simply as a substring of the form "name" needle = "\"" + needle + "\":"; diff --git a/src/Functions/HasTokenImpl.h b/src/Functions/HasTokenImpl.h index 11256b40933..044c50b6742 100644 --- a/src/Functions/HasTokenImpl.h +++ b/src/Functions/HasTokenImpl.h @@ -8,8 +8,8 @@ namespace DB namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; } /** Token search the string, means that needle must be surrounded by some separator chars, like whitespace or puctuation. 
diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index ce653cd342c..54ceb05645d 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -24,8 +24,8 @@ namespace DB namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; } From 1ac1955ffba907130392a2b36a274a7a314ad79b Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 5 Aug 2020 14:32:41 +0300 Subject: [PATCH 5/7] bump CI --- src/Functions/PositionImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/PositionImpl.h b/src/Functions/PositionImpl.h index 922665517dc..2cdf4004ce3 100644 --- a/src/Functions/PositionImpl.h +++ b/src/Functions/PositionImpl.h @@ -361,7 +361,7 @@ struct PositionImpl const ColumnPtr & start_pos, PaddedPODArray & res) { - // NOTE You could use haystack indexing. But this is a rare case. + /// NOTE You could use haystack indexing. But this is a rare case. ColumnString::Offset prev_needle_offset = 0; From 8fc470d1c588ec59713f7cfbd8091f9dda6d4906 Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 15 Aug 2020 18:08:00 +0000 Subject: [PATCH 6/7] Fix compilation error in MultiMatchAnyImpl.h --- src/Functions/MultiMatchAnyImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 78c314fed69..c8d0ea6c87f 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -120,7 +120,7 @@ struct MultiMatchAnyImpl memset(accum.data(), 0, accum.size()); for (size_t j = 0; j < needles.size(); ++j) { - MatchImpl::vectorConstant(haystack_data, haystack_offsets, needles[j].toString(), accum); + MatchImpl::vectorConstant(haystack_data, haystack_offsets, needles[j].toString(), nullptr, accum); for (size_t i = 0; i < res.size(); ++i) { if constexpr (FindAny) From df63ada6ccd9c7751e0c63fa494bc90a05f08224 Mon Sep 17 00:00:00 2001 From: vdimir
Date: Sat, 15 Aug 2020 20:08:59 +0000 Subject: [PATCH 7/7] Fix Darwin build (std::max candidate template) --- src/Functions/PositionImpl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Functions/PositionImpl.h b/src/Functions/PositionImpl.h index 2cdf4004ce3..f5d8bd84c3f 100644 --- a/src/Functions/PositionImpl.h +++ b/src/Functions/PositionImpl.h @@ -243,7 +243,7 @@ struct PositionImpl UInt64 start_pos, UInt64 & res) { - auto start = std::max(start_pos, 1ul); + auto start = std::max(start_pos, UInt64(1)); if (needle.size() == 0) { @@ -311,7 +311,7 @@ struct PositionImpl size_t needle_size = needle_offsets[i] - prev_needle_offset - 1; size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1; - auto start = start_pos != nullptr ? std::max(start_pos->getUInt(i), 1ul) : 1ul; + auto start = start_pos != nullptr ? std::max(start_pos->getUInt(i), UInt64(1)) : UInt64(1); if (start > haystack_size + 1) { @@ -371,7 +371,7 @@ struct PositionImpl { size_t needle_size = needle_offsets[i] - prev_needle_offset - 1; - auto start = start_pos != nullptr ? std::max(start_pos->getUInt(i), 1ul) : 1ul; + auto start = start_pos != nullptr ? std::max(start_pos->getUInt(i), UInt64(1)) : UInt64(1); if (start > haystack.size() + 1) {