ClickHouse/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql
Robert Schulze 81bb2242fd
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback searchers
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains a 0-byte (perhaps because
it's sort of expensive). As a consequence, if the haystack contained a
zero byte in its payload, matches behind this zero byte were ignored.

    create table t (id UInt32, pattern String) engine = MergeTree() order by id;
    insert into t values (1, 'x');
    select countSubstrings('aaaxxxaa\0xxx', pattern) from t;

We returned 3 before this commit; now we return 6.
2022-06-29 21:41:18 +00:00

25 lines
1.1 KiB
SQL

-- Start from a clean slate in case an earlier run left the table behind.
drop table if exists tab;

-- haystack and pattern live in columns so the queries further down can pass
-- them to the search functions as non-constant arguments.
create table tab
(
    id UInt32,
    haystack String,
    pattern String
)
engine = MergeTree()
order by id;
-- Row 1: the haystack has three 'x' in front of the embedded zero byte and
-- three more behind it.
insert into tab (id, haystack, pattern)
values (1, 'aaaxxxaa\0xxx', 'x');

-- Haystack given as a literal, pattern read from the table.
select countSubstrings('aaaxxxaa\0xxx', pattern)
from tab
where id = 1;

select countSubstringsCaseInsensitive('aaaxxxaa\0xxx', pattern)
from tab
where id = 1;

select countSubstringsCaseInsensitiveUTF8('aaaxxxaa\0xxx', pattern)
from tab
where id = 1;

-- Haystack and pattern both read from the table.
select countSubstrings(haystack, pattern)
from tab
where id = 1;

select countSubstringsCaseInsensitive(haystack, pattern)
from tab
where id = 1;

select countSubstringsCaseInsensitiveUTF8(haystack, pattern)
from tab
where id = 1;
-- Row 2: the only 'x' in the haystack sits directly behind the embedded
-- zero byte.
insert into tab (id, haystack, pattern)
values (2, 'aaaaa\0x', 'x');

-- Haystack given as a literal, pattern read from the table.
select position('aaaaa\0x', pattern)
from tab
where id = 2;

select positionCaseInsensitive('aaaaa\0x', pattern)
from tab
where id = 2;

select positionCaseInsensitiveUTF8('aaaaa\0x', pattern)
from tab
where id = 2;

-- Haystack and pattern both read from the table.
select position(haystack, pattern)
from tab
where id = 2;

select positionCaseInsensitive(haystack, pattern)
from tab
where id = 2;

select positionCaseInsensitiveUTF8(haystack, pattern)
from tab
where id = 2;

-- Remove the fixture table again.
drop table if exists tab;