mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 23:52:03 +00:00
Issue 7334: Fixed utf8 string case-insensitive searching issue
This commit is contained in:
parent
189ff60f32
commit
8b24688afb
@ -24,7 +24,6 @@ namespace DB
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int UNSUPPORTED_PARAMETER;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
@ -34,9 +33,12 @@ namespace ErrorCodes
|
||||
*/
|
||||
|
||||
|
||||
struct StringSearcherBase
|
||||
class StringSearcherBase
|
||||
{
|
||||
public:
|
||||
bool force_fallback = false;
|
||||
#ifdef __SSE2__
|
||||
protected:
|
||||
static constexpr auto n = sizeof(__m128i);
|
||||
const int page_size = ::getPageSize();
|
||||
|
||||
@ -53,7 +55,7 @@ template <bool CaseSensitive, bool ASCII> class StringSearcher;
|
||||
|
||||
/// Case-insensitive UTF-8 searcher
|
||||
template <>
|
||||
class StringSearcher<false, false> : private StringSearcherBase
|
||||
class StringSearcher<false, false> : public StringSearcherBase
|
||||
{
|
||||
private:
|
||||
using UTF8SequenceBuffer = uint8_t[6];
|
||||
@ -119,11 +121,14 @@ public:
|
||||
size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq));
|
||||
|
||||
if (length_l != length_u)
|
||||
throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
|
||||
force_fallback = true;
|
||||
}
|
||||
|
||||
l = l_seq[0];
|
||||
u = u_seq[0];
|
||||
|
||||
if (force_fallback)
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
@ -158,7 +163,10 @@ public:
|
||||
|
||||
/// @note Unicode standard states it is a rare but possible occasion
|
||||
if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
|
||||
throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
|
||||
{
|
||||
force_fallback = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
cache_actual_len += src_len;
|
||||
@ -199,9 +207,10 @@ public:
|
||||
if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
|
||||
break;
|
||||
|
||||
/// @note assuming sequences for lowercase and uppercase have exact same length (that is not always true)
|
||||
const auto len = UTF8::seqLength(*haystack_pos);
|
||||
auto len = UTF8::seqLength(*haystack_pos);
|
||||
haystack_pos += len;
|
||||
|
||||
len = UTF8::seqLength(*needle_pos);
|
||||
needle_pos += len;
|
||||
}
|
||||
|
||||
@ -213,7 +222,7 @@ public:
|
||||
{
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
if (pageSafe(pos))
|
||||
if (pageSafe(pos) && !force_fallback)
|
||||
{
|
||||
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
|
||||
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
|
||||
@ -262,7 +271,7 @@ public:
|
||||
while (haystack < haystack_end)
|
||||
{
|
||||
#ifdef __SSE4_1__
|
||||
if (haystack + n <= haystack_end && pageSafe(haystack))
|
||||
if (haystack + n <= haystack_end && pageSafe(haystack) && !force_fallback)
|
||||
{
|
||||
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
|
||||
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
|
||||
@ -339,7 +348,7 @@ public:
|
||||
|
||||
/// Case-insensitive ASCII searcher
|
||||
template <>
|
||||
class StringSearcher<false, true> : private StringSearcherBase
|
||||
class StringSearcher<false, true> : public StringSearcherBase
|
||||
{
|
||||
private:
|
||||
/// string to be searched for
|
||||
@ -541,7 +550,7 @@ public:
|
||||
|
||||
/// Case-sensitive searcher (both ASCII and UTF-8)
|
||||
template <bool ASCII>
|
||||
class StringSearcher<true, ASCII> : private StringSearcherBase
|
||||
class StringSearcher<true, ASCII> : public StringSearcherBase
|
||||
{
|
||||
private:
|
||||
/// string to be searched for
|
||||
@ -725,7 +734,7 @@ public:
|
||||
// Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings
|
||||
// should work just fine. But any Unicode whitespace is not considered a token separtor.
|
||||
template <typename StringSearcher>
|
||||
class TokenSearcher
|
||||
class TokenSearcher: public StringSearcherBase
|
||||
{
|
||||
StringSearcher searcher;
|
||||
size_t needle_size;
|
||||
@ -809,7 +818,7 @@ using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStri
|
||||
* It is required that strings are zero-terminated.
|
||||
*/
|
||||
|
||||
struct LibCASCIICaseSensitiveStringSearcher
|
||||
struct LibCASCIICaseSensitiveStringSearcher: public StringSearcherBase
|
||||
{
|
||||
const char * const needle;
|
||||
|
||||
@ -833,7 +842,7 @@ struct LibCASCIICaseSensitiveStringSearcher
|
||||
}
|
||||
};
|
||||
|
||||
struct LibCASCIICaseInsensitiveStringSearcher
|
||||
struct LibCASCIICaseInsensitiveStringSearcher: public StringSearcherBase
|
||||
{
|
||||
const char * const needle;
|
||||
|
||||
|
@ -372,7 +372,7 @@ public:
|
||||
, fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}
|
||||
, fallback_searcher{needle_, needle_size}
|
||||
{
|
||||
if (fallback)
|
||||
if (fallback || fallback_searcher.force_fallback)
|
||||
return;
|
||||
|
||||
hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
|
||||
@ -393,7 +393,7 @@ public:
|
||||
|
||||
const auto haystack_end = haystack + haystack_size;
|
||||
|
||||
if (fallback || haystack_size <= needle_size)
|
||||
if (fallback || haystack_size <= needle_size || fallback_searcher.force_fallback)
|
||||
return fallback_searcher.search(haystack, haystack_end);
|
||||
|
||||
/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
|
||||
|
@ -90,3 +90,31 @@
|
||||
21
|
||||
22
|
||||
23
|
||||
6
|
||||
7
|
||||
7
|
||||
5
|
||||
6
|
||||
7
|
||||
8
|
||||
9
|
||||
10
|
||||
11
|
||||
12
|
||||
13
|
||||
14
|
||||
15
|
||||
16
|
||||
17
|
||||
5
|
||||
6
|
||||
7
|
||||
8
|
||||
9
|
||||
10
|
||||
11
|
||||
12
|
||||
13
|
||||
14
|
||||
15
|
||||
16
|
||||
|
@ -93,3 +93,34 @@ SELECT position(concat(' иголка.ру', arrayStringConcat
|
||||
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
|
||||
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
|
||||
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
|
||||
|
||||
SELECT positionCaseInsensitiveUTF8(materialize('test ß test'), 'ß') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize('test AaßAa test'), 'aßa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize('test A1ß2a test'), '1ß2') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat('test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
|
Loading…
Reference in New Issue
Block a user