From 1eed72b525aab4bcf074f4b7b1ed8e7e4dc36937 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 29 Jun 2022 10:37:42 +0000 Subject: [PATCH] Make more multi-search methods work with non-const needles After making function multi[Fuzzy]Match(Any|AnyIndex|AllIndices)() work with non-const needles, 12 more functions started to fail in test "00233_position_function_family": multiSearchAny() multiSearchAnyCaseInsensitive() multiSearchAnyUTF8() multiSearchAnyCaseInsensitiveUTF8() multiSearchFirstPosition() multiSearchFirstPositionCaseInsensitive() multiSearchFirstPositionUTF8() multiSearchFirstPositionCaseInsensitiveUTF8() multiSearchFirstIndex() multiSearchFirstIndexCaseInsensitive() multiSearchFirstIndexUTF8() multiSearchFirstIndexCaseInsensitiveUTF8() Failing queries take the form select 0 = multiSearchAny('\0', CAST([], 'Array(String)')); --- src/Functions/MultiMatchAllIndicesImpl.h | 5 +- src/Functions/MultiMatchAnyImpl.h | 5 +- src/Functions/MultiSearchFirstIndexImpl.h | 57 +++++++++++++-- src/Functions/MultiSearchFirstPositionImpl.h | 70 +++++++++++++++++-- src/Functions/MultiSearchImpl.h | 55 +++++++++++++-- ...tringsearch_with_nonconst_needle.reference | 24 +++++++ ...2294_stringsearch_with_nonconst_needle.sql | 34 +++++++++ 7 files changed, 223 insertions(+), 27 deletions(-) diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index 7ff2376593c..adba8817b33 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -165,9 +165,8 @@ struct MultiMatchAllIndicesImpl size_t prev_haystack_offset = 0; for (size_t i = 0; i < haystack_offsets.size(); ++i) { - Field field; - needles_col.get(i, field); - Array & needles_arr = DB::get(field); + Field field = needles_col[i]; + const Array & needles_arr = DB::get(field); std::vector needles; needles.reserve(needles_arr.size()); diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 
24cf6b53f30..fa56a5f0924 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -192,9 +192,8 @@ struct MultiMatchAnyImpl size_t prev_haystack_offset = 0; for (size_t i = 0; i < haystack_offsets.size(); ++i) { - Field field; - needles_col.get(i, field); - Array & needles_arr = DB::get(field); + Field field = needles_col[i]; + const Array & needles_arr = DB::get(field); std::vector needles; needles.reserve(needles_arr.size()); diff --git a/src/Functions/MultiSearchFirstIndexImpl.h b/src/Functions/MultiSearchFirstIndexImpl.h index 9e717aaff23..56ac06551f6 100644 --- a/src/Functions/MultiSearchFirstIndexImpl.h +++ b/src/Functions/MultiSearchFirstIndexImpl.h @@ -30,7 +30,7 @@ struct MultiSearchFirstIndexImpl const ColumnString::Offsets & haystack_offsets, const Array & needles_arr, PaddedPODArray & res, - [[maybe_unused]] PaddedPODArray & offsets, + PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) @@ -47,13 +47,15 @@ struct MultiSearchFirstIndexImpl needles.emplace_back(needle.get()); auto searcher = Impl::createMultiSearcherInBigHaystack(needles); - const size_t haystack_string_size = haystack_offsets.size(); - res.resize(haystack_string_size); + + const size_t haystack_size = haystack_offsets.size(); + res.resize(haystack_size); + size_t iteration = 0; while (searcher.hasMoreToSearch()) { size_t prev_offset = 0; - for (size_t j = 0; j < haystack_string_size; ++j) + for (size_t j = 0; j < haystack_size; ++j) { const auto * haystack = &haystack_data[prev_offset]; const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; @@ -68,10 +70,51 @@ struct MultiSearchFirstIndexImpl std::fill(res.begin(), res.end(), 0); } - template - static void vectorVector(Args &&...) 
+ static void vectorVector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnArray & needles_col, + PaddedPODArray & res, + PaddedPODArray & /*offsets*/, + bool /*allow_hyperscan*/, + size_t /*max_hyperscan_regexp_length*/, + size_t /*max_hyperscan_regexp_total_length*/) { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name); + const size_t haystack_size = haystack_offsets.size(); + res.resize(haystack_size); + + size_t prev_offset = 0; + + for (size_t i = 0; i < haystack_size; ++i) + { + Field field = needles_col[i]; + const Array & needles_arr = DB::get(field); + + std::vector needles; + needles.reserve(needles_arr.size()); + for (const auto & needle : needles_arr) + needles.emplace_back(needle.get()); + + auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal + + const auto * const haystack = &haystack_data[prev_offset]; + const auto * haystack_end = haystack + haystack_offsets[i] - prev_offset - 1; + + size_t iteration = 0; + while (searcher.hasMoreToSearch()) + { + if (iteration == 0 || res[i] == 0) + { + res[i] = searcher.searchOneFirstIndex(haystack, haystack_end); + } + ++iteration; + } + if (iteration == 0) + { + res[i] = 0; + } + prev_offset = haystack_offsets[i]; + } } }; diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 1f0cf1e6463..f1d79b2685d 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -30,7 +30,7 @@ struct MultiSearchFirstPositionImpl const ColumnString::Offsets & haystack_offsets, const Array & needles_arr, PaddedPODArray & res, - [[maybe_unused]] PaddedPODArray & offsets, + PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) @@ -51,13 +51,15 @@ struct MultiSearchFirstPositionImpl return 1 + 
Impl::countChars(reinterpret_cast(start), reinterpret_cast(end)); }; auto searcher = Impl::createMultiSearcherInBigHaystack(needles); - const size_t haystack_string_size = haystack_offsets.size(); - res.resize(haystack_string_size); + + const size_t haystack_size = haystack_offsets.size(); + res.resize(haystack_size); + size_t iteration = 0; while (searcher.hasMoreToSearch()) { size_t prev_offset = 0; - for (size_t j = 0; j < haystack_string_size; ++j) + for (size_t j = 0; j < haystack_size; ++j) { const auto * haystack = &haystack_data[prev_offset]; const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; @@ -77,10 +79,64 @@ struct MultiSearchFirstPositionImpl std::fill(res.begin(), res.end(), 0); } - template - static void vectorVector(Args &&...) + static void vectorVector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnArray & needles_col, + PaddedPODArray & res, + PaddedPODArray & /*offsets*/, + bool /*allow_hyperscan*/, + size_t /*max_hyperscan_regexp_length*/, + size_t /*max_hyperscan_regexp_total_length*/) { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name); + const size_t haystack_size = haystack_offsets.size(); + res.resize(haystack_size); + + size_t prev_offset = 0; + + auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64 + { + return 1 + Impl::countChars(reinterpret_cast(start), reinterpret_cast(end)); + }; + + for (size_t i = 0; i < haystack_size; ++i) + { + Field field = needles_col[i]; + const Array & needles_arr = DB::get(field); + + std::vector needles; + needles.reserve(needles_arr.size()); + for (const auto & needle : needles_arr) + needles.emplace_back(needle.get()); + + auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal + + const auto * const haystack = &haystack_data[prev_offset]; + const auto * haystack_end = haystack + haystack_offsets[i] - prev_offset - 1; 
+ + size_t iteration = 0; + while (searcher.hasMoreToSearch()) + { + if (iteration == 0 || res[i] == 0) + { + res[i] = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback); + } + else + { + UInt64 result = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback); + if (result != 0) + { + res[i] = std::min(result, res[i]); + } + } + ++iteration; + } + if (iteration == 0) + { + res[i] = 0; + } + prev_offset = haystack_offsets[i]; + } } }; diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index 53406d9324d..86567625252 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -30,7 +30,7 @@ struct MultiSearchImpl const ColumnString::Offsets & haystack_offsets, const Array & needles_arr, PaddedPODArray & res, - [[maybe_unused]] PaddedPODArray & offsets, + PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, size_t /*max_hyperscan_regexp_total_length*/) @@ -47,13 +47,15 @@ struct MultiSearchImpl needles.emplace_back(needle.get()); auto searcher = Impl::createMultiSearcherInBigHaystack(needles); - const size_t haystack_string_size = haystack_offsets.size(); - res.resize(haystack_string_size); + + const size_t haystack_size = haystack_offsets.size(); + res.resize(haystack_size); + size_t iteration = 0; while (searcher.hasMoreToSearch()) { size_t prev_offset = 0; - for (size_t j = 0; j < haystack_string_size; ++j) + for (size_t j = 0; j < haystack_size; ++j) { const auto * haystack = &haystack_data[prev_offset]; const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1; @@ -67,10 +69,49 @@ struct MultiSearchImpl std::fill(res.begin(), res.end(), 0); } - template - static void vectorVector(Args &&...) 
+ static void vectorVector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnArray & needles_col, + PaddedPODArray & res, + PaddedPODArray & /*offsets*/, + bool /*allow_hyperscan*/, + size_t /*max_hyperscan_regexp_length*/, + size_t /*max_hyperscan_regexp_total_length*/) { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name); + const size_t haystack_size = haystack_offsets.size(); + res.resize(haystack_size); + + size_t prev_offset = 0; + + for (size_t i = 0; i < haystack_size; ++i) + { + const auto * const haystack = &haystack_data[prev_offset]; + const size_t haystack_length = haystack_offsets[i] - prev_offset - 1; + + Field field = needles_col[i]; + const Array & needles_arr = DB::get(field); + + std::vector needles; + needles.reserve(needles_arr.size()); + for (const auto & needle : needles_arr) + needles.emplace_back(needle.get()); + + size_t iteration = 0; + for (size_t j = 0; j < needles_arr.size(); ++j) + { + auto searcher = Impl::createSearcherInSmallHaystack(needles[j].data(), needles[j].size()); + if (iteration == 0 || !res[i]) + { + const auto * match = searcher.search(haystack, haystack_length); + res[i] = (match != haystack + haystack_length); + } + ++iteration; + } + if (iteration == 0) + res[i] = 0; + prev_offset = haystack_offsets[i]; + } } }; diff --git a/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.reference b/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.reference index 7471bcad00c..76ebc1fb99e 100644 --- a/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.reference +++ b/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.reference @@ -188,3 +188,27 @@ MATCH 35 Hello .*ell.* 1 36 Hello o$ 1 37 Hello hE.*lO 0 +MULTISEARCHANY +1 +1 +1 +1 +1 +1 +1 +MULTISEARCHFIRSTINDEX +1 +1 +1 +1 +1 +1 +1 +MULTISEARCHFIRSTPOSITION +1 +1 +1 +1 +1 +1 +1 diff --git 
a/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.sql b/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.sql index 3057e342733..6dd4c4f396d 100644 --- a/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.sql +++ b/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.sql @@ -1,3 +1,5 @@ +-- tests of "(not) (i)like" functions + drop table if exists non_const_needle; create table non_const_needle @@ -34,3 +36,35 @@ select id, haystack, needle, match(haystack, needle) order by id; drop table if exists non_const_needle; + +-- rudimentary tests of "multiSearchAny()", "multiSearchFirstIndex()" and "multiSearchFirstPosition()" functions + +select 'MULTISEARCHANY'; +select multiSearchAny(materialize('Hello World'), materialize([])); -- { serverError 43 } +select 0 = multiSearchAny('Hello World', CAST([], 'Array(String)')); +select 1 = multiSearchAny(materialize('Hello World'), materialize(['orld'])); +select 0 = multiSearchAny(materialize('Hello World'), materialize(['Hallo', 'Welt'])); +select 1 = multiSearchAny(materialize('Hello World'), materialize(['Hallo', 'orld'])); +select 1 = multiSearchAnyCaseInsensitive(materialize('Hello World'), materialize(['WORLD'])); +select 1 = multiSearchAnyUTF8(materialize('Hello World £'), materialize(['WORLD', '£'])); +select 1 = multiSearchAnyCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD'])); + +select 'MULTISEARCHFIRSTINDEX'; +select multiSearchFirstIndex(materialize('Hello World'), materialize([])); -- { serverError 43 } +select 0 = multiSearchFirstIndex('Hello World', CAST([], 'Array(String)')); +select 1 = multiSearchFirstIndex(materialize('Hello World'), materialize(['orld'])); +select 0 = multiSearchFirstIndex(materialize('Hello World'), materialize(['Hallo', 'Welt'])); +select 2 = multiSearchFirstIndex(materialize('Hello World'), materialize(['Hallo', 'orld'])); +select 1 = multiSearchFirstIndexCaseInsensitive(materialize('Hello World'), 
materialize(['WORLD'])); +select 2 = multiSearchFirstIndexUTF8(materialize('Hello World £'), materialize(['WORLD', '£'])); +select 1 = multiSearchFirstIndexCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD'])); + +select 'MULTISEARCHFIRSTPOSITION'; +select multiSearchFirstPosition(materialize('Hello World'), materialize([])); -- { serverError 43 } +select 0 = multiSearchFirstPosition('Hello World', CAST([], 'Array(String)')); +select 8 = multiSearchFirstPosition(materialize('Hello World'), materialize(['orld'])); +select 0 = multiSearchFirstPosition(materialize('Hello World'), materialize(['Hallo', 'Welt'])); +select 8 = multiSearchFirstPosition(materialize('Hello World'), materialize(['Hallo', 'orld'])); +select 7 = multiSearchFirstPositionCaseInsensitive(materialize('Hello World'), materialize(['WORLD'])); +select 13 = multiSearchFirstPositionUTF8(materialize('Hello World £'), materialize(['WORLD', '£'])); +select 7 = multiSearchFirstPositionCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));