From 81bb2242fdd2c98af7ad0dcd66d3b2275f0aadba Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Wed, 29 Jun 2022 15:08:16 +0000
Subject: [PATCH] Fix countSubstrings() & position() on patterns with 0-bytes

SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use the fallback searchers
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher,
which call ::strstr() and ::strcasestr(), respectively. These functions assume
that the haystack is 0-terminated, and they even document that. However, the
callers did not check whether the haystack contains a 0-byte (perhaps because
it's sort of expensive). As a consequence, if the haystack contained a
zero byte in its payload, matches behind this zero byte were ignored.

    create table t (id UInt32, pattern String) engine = MergeTree() order by id;
    insert into t values (1, 'x');
    select countSubstrings('aaaxxxaa\0xxx', pattern) from t;

We returned 3 before this commit; now we return 6.
---
 src/Common/StringSearcher.h                   | 69 +++++++------------
 src/Functions/PositionImpl.h                  |  6 +-
 ...sition_countsubstrings_zero_byte.reference | 12 ++++
 ...346_position_countsubstrings_zero_byte.sql | 24 +++++++
 4 files changed, 62 insertions(+), 49 deletions(-)
 create mode 100644 tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.reference
 create mode 100644 tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql
diff --git a/src/Common/StringSearcher.h b/src/Common/StringSearcher.h
index a82115a9923..7d669ddd369 100644
--- a/src/Common/StringSearcher.h
+++ b/src/Common/StringSearcher.h
@@ -826,66 +826,43 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
 using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
 using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
 
-
-/** Uses functions from libc.
-  * It makes sense to use only with short haystacks when cheap initialization is required.
-  * There is no option for case-insensitive search for UTF-8 strings.
-  * It is required that strings are zero-terminated.
-  */
-
-struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
+/// Byte-wise substring searcher built on std::search. Haystack and needle are (pointer, length) ranges, so 0-bytes are ordinary payload.
+/// Use only with short haystacks where cheap initialization is required. search() returns the first match, or haystack_end if there is none.
+template <bool CaseInsensitive>
+struct StdLibASCIIStringSearcher : public StringSearcherBase
 {
-    const char * const needle;
+    const char * const needle_start;
+    const char * const needle_end;
 
     template <typename CharT>
     requires (sizeof(CharT) == 1)
-    LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
-        : needle(reinterpret_cast<const char *>(needle_)) {}
+    StdLibASCIIStringSearcher(const CharT * const needle_start_, const size_t needle_size_)
+        : needle_start{reinterpret_cast<const char *>(needle_start_)}
+        , needle_end{needle_start + needle_size_}
+    {}
 
     template <typename CharT>
     requires (sizeof(CharT) == 1)
-    const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
+    const CharT * search(const CharT * haystack_start, const CharT * const haystack_end) const
     {
-        const auto * res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
-        if (!res)
-            return haystack_end;
-        return reinterpret_cast<const CharT *>(res);
+        /// The predicates take `char` on purpose: CharT may be unsigned while the needle is `char`; where `char` is signed, a plain `==`
+        /// on the promoted values would never match bytes >= 0x80. toupper() needs values representable as unsigned char, hence the casts.
+        if constexpr (CaseInsensitive)
+            return std::search(
+                haystack_start, haystack_end, needle_start, needle_end,
+                [](char c1, char c2)
+                { return std::toupper(static_cast<unsigned char>(c1)) == std::toupper(static_cast<unsigned char>(c2)); });
+        else
+            return std::search(
+                haystack_start, haystack_end, needle_start, needle_end, [](char c1, char c2) { return c1 == c2; });
     }
 
     template <typename CharT>
     requires (sizeof(CharT) == 1)
-    const CharT * search(const CharT * haystack, const size_t haystack_size) const
+    const CharT * search(const CharT * haystack_start, const size_t haystack_length) const
     {
-        return search(haystack, haystack + haystack_size);
+        return search(haystack_start, haystack_start + haystack_length);
     }
 };
 
-struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
-{
-    const char * const needle;
-
-    template <typename CharT>
-    requires (sizeof(CharT) == 1)
-    LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
-        : needle(reinterpret_cast<const char *>(needle_)) {}
-
-    template <typename CharT>
-    requires (sizeof(CharT) == 1)
-    const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
-    {
-        const auto * res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
-        if (!res)
-            return haystack_end;
-        return reinterpret_cast<const CharT *>(res);
-    }
-
-    template <typename CharT>
-    requires (sizeof(CharT) == 1)
-    const CharT * search(const CharT * haystack, const size_t haystack_size) const
-    {
-        return search(haystack, haystack + haystack_size);
-    }
-};
-
-
 }
diff --git a/src/Functions/PositionImpl.h b/src/Functions/PositionImpl.h
index 5380fcc36d9..76f10373a58 100644
--- a/src/Functions/PositionImpl.h
+++ b/src/Functions/PositionImpl.h
@@ -26,7 +26,7 @@ struct PositionCaseSensitiveASCII
     using MultiSearcherInBigHaystack = MultiVolnitsky;
 
     /// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
-    using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
+    using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
 
     static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
     {
@@ -62,7 +62,7 @@ struct PositionCaseInsensitiveASCII
     /// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
     using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
     using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
-    using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;
+    using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ true>;
 
     static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
     {
@@ -94,7 +94,7 @@ struct PositionCaseSensitiveUTF8
 {
     using SearcherInBigHaystack = VolnitskyUTF8;
     using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
-    using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
+    using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
 
     static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
     {
diff --git a/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.reference b/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.reference
new file mode 100644
index 00000000000..2b70bdc272e
--- /dev/null
+++ b/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.reference
@@ -0,0 +1,12 @@
+6
+6
+6
+6
+6
+6
+7
+7
+7
+7
+7
+7
diff --git a/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql b/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql
new file mode 100644
index 00000000000..6208baf41c4
--- /dev/null
+++ b/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql
@@ -0,0 +1,24 @@
+drop table if exists tab;
+
+create table tab (id UInt32, haystack String, pattern String) engine = MergeTree() order by id;
+insert into tab values (1, 'aaaxxxaa\0xxx', 'x');
+-- Constant haystack, pattern read from a column. Three of the six 'x' are behind the 0-byte; each query must count all 6.
+select countSubstrings('aaaxxxaa\0xxx', pattern) from tab where id = 1;
+select countSubstringsCaseInsensitive('aaaxxxaa\0xxx', pattern) from tab where id = 1;
+select countSubstringsCaseInsensitiveUTF8('aaaxxxaa\0xxx', pattern) from tab where id = 1;
+-- Same checks with the haystack read from a column as well.
+select countSubstrings(haystack, pattern) from tab where id = 1;
+select countSubstringsCaseInsensitive(haystack, pattern) from tab where id = 1;
+select countSubstringsCaseInsensitiveUTF8(haystack, pattern) from tab where id = 1;
+
+insert into tab values (2, 'aaaaa\0x', 'x');
+-- Constant haystack, pattern read from a column. The only 'x' is behind the 0-byte, at 1-based position 7.
+select position('aaaaa\0x', pattern) from tab where id = 2;
+select positionCaseInsensitive('aaaaa\0x', pattern) from tab where id = 2;
+select positionCaseInsensitiveUTF8('aaaaa\0x', pattern) from tab where id = 2;
+-- Same checks with the haystack read from a column as well.
+select position(haystack, pattern) from tab where id = 2;
+select positionCaseInsensitive(haystack, pattern) from tab where id = 2;
+select positionCaseInsensitiveUTF8(haystack, pattern) from tab where id = 2;
+
+drop table if exists tab;