From 1326bffe606b92e5501db0d62668234005b7790b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 23 Sep 2023 04:14:03 +0200 Subject: [PATCH] Allow LIKE over binary data --- src/Common/OptimizedRegularExpression.cpp | 8 ++++++ .../0_stateless/02886_binary_like.reference | 24 +++++++++++++++++ .../queries/0_stateless/02886_binary_like.sql | 26 +++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 tests/queries/0_stateless/02886_binary_like.reference create mode 100644 tests/queries/0_stateless/02886_binary_like.sql diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp index 3501a355c19..d64b26a28a3 100644 --- a/src/Common/OptimizedRegularExpression.cpp +++ b/src/Common/OptimizedRegularExpression.cpp @@ -496,6 +496,14 @@ OptimizedRegularExpression::OptimizedRegularExpression(const std::string & regex regexp_options.set_dot_nl(true); re2 = std::make_unique<re2::RE2>(regexp_, regexp_options); + + /// Fallback to latin1 to allow matching binary data. 
+ if (!re2->ok() && re2->error_code() == re2::RE2::ErrorCode::ErrorBadUTF8) + { + regexp_options.set_encoding(re2::RE2::Options::EncodingLatin1); + re2 = std::make_unique<re2::RE2>(regexp_, regexp_options); + } + if (!re2->ok()) { throw DB::Exception(DB::ErrorCodes::CANNOT_COMPILE_REGEXP, diff --git a/tests/queries/0_stateless/02886_binary_like.reference b/tests/queries/0_stateless/02886_binary_like.reference new file mode 100644 index 00000000000..c23ceaf9784 --- /dev/null +++ b/tests/queries/0_stateless/02886_binary_like.reference @@ -0,0 +1,24 @@ +1 +1 +1 +1 +1 +0 +0 +1 +1 +1 +1 +1 +1 +0 +0 +1 +1 +1 +1 +1 +1 +0 +0 +1 diff --git a/tests/queries/0_stateless/02886_binary_like.sql b/tests/queries/0_stateless/02886_binary_like.sql new file mode 100644 index 00000000000..ba11f1fc060 --- /dev/null +++ b/tests/queries/0_stateless/02886_binary_like.sql @@ -0,0 +1,26 @@ +SELECT 'aяb' LIKE 'a_b'; +SELECT 'a\0b' LIKE 'a_b'; +SELECT 'a\0b' LIKE 'a\0b'; +SELECT 'a\0b' LIKE 'a%\0b'; +SELECT 'a\xFFb' LIKE 'a%\xFFb'; +SELECT 'a\xFFb' LIKE 'a%\xFF\xFEb'; +SELECT 'a\xFFb' LIKE '%a\xFF\xFEb'; +SELECT 'a\xFF\xFEb' LIKE '%a\xFF\xFEb'; + +SELECT materialize('aяb') LIKE 'a_b'; +SELECT materialize('a\0b') LIKE 'a_b'; +SELECT materialize('a\0b') LIKE 'a\0b'; +SELECT materialize('a\0b') LIKE 'a%\0b'; +SELECT materialize('a\xFFb') LIKE 'a%\xFFb'; +SELECT materialize('a\xFFb') LIKE 'a%\xFF\xFEb'; +SELECT materialize('a\xFFb') LIKE '%a\xFF\xFEb'; +SELECT materialize('a\xFF\xFEb') LIKE '%a\xFF\xFEb'; + +SELECT materialize('aяb') LIKE materialize('a_b'); +SELECT materialize('a\0b') LIKE materialize('a_b'); +SELECT materialize('a\0b') LIKE materialize('a\0b'); +SELECT materialize('a\0b') LIKE materialize('a%\0b'); +SELECT materialize('a\xFFb') LIKE materialize('a%\xFFb'); +SELECT materialize('a\xFFb') LIKE materialize('a%\xFF\xFEb'); +SELECT materialize('a\xFFb') LIKE materialize('%a\xFF\xFEb'); +SELECT materialize('a\xFF\xFEb') LIKE materialize('%a\xFF\xFEb');