Fix Bug 37114 - ilike on FixedString(N) columns produces wrong results

The main fix is in MatchImpl.h where the "case_insensitive" parameter is
added to Regexps::get().

Also made "case_insensitive" a non-default template parameter to reduce
the risk of future bugs.

The remainder of this commit are minor random code improvements.

resoves #37114
This commit is contained in:
Robert Schulze 2022-05-11 13:54:26 +02:00
parent 48c165b360
commit 7232f47c68
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
9 changed files with 38 additions and 30 deletions

View File

@ -94,7 +94,7 @@ public:
template <typename Factory>
Pointer get(const Key & key, Factory && f)
{
std::unique_lock lock(mutex);
std::lock_guard lock(mutex);
auto it = container.find(key);
if (container.end() == it)

View File

@ -477,7 +477,7 @@ public:
ErrorCodes::ILLEGAL_COLUMN);
if (!col->getValue<String>().empty())
re = Regexps::get<false, false>(col->getValue<String>());
re = Regexps::get<false, false, false>(col->getValue<String>());
}
@ -560,7 +560,7 @@ public:
+ " of first argument of function " + getName() + ". Must be constant string.",
ErrorCodes::ILLEGAL_COLUMN);
re = Regexps::get<false, false>(col->getValue<String>());
re = Regexps::get<false, false, false>(col->getValue<String>());
capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0;
matches.resize(capture + 1);

View File

@ -24,12 +24,11 @@ namespace ErrorCodes
/// Is the [I]LIKE expression reduced to finding a substring in a string?
static inline bool likePatternIsStrstr(const String & pattern, String & res)
{
res = "";
if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%')
return false;
res.reserve(pattern.size() * 2);
res = "";
res.reserve(pattern.size() - 2);
const char * pos = pattern.data();
const char * end = pos + pattern.size();
@ -81,7 +80,7 @@ struct MatchImpl
static void vectorConstant(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & pattern,
const String & pattern,
const ColumnPtr & start_pos,
PaddedPODArray<UInt8> & res)
{
@ -92,14 +91,13 @@ struct MatchImpl
if (offsets.empty())
return;
String strstr_pattern;
/// A simple case where the [I]LIKE expression reduces to finding a substring in a string
String strstr_pattern;
if (like && likePatternIsStrstr(pattern, strstr_pattern))
{
const UInt8 * begin = data.data();
const UInt8 * const begin = data.data();
const UInt8 * const end = data.data() + data.size();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
/// The current index in the array of strings.
size_t i = 0;
@ -137,7 +135,7 @@ struct MatchImpl
auto regexp = Regexps::get<like, true, case_insensitive>(pattern);
std::string required_substring;
String required_substring;
bool is_trivial;
bool required_substring_is_prefix; /// for `anchored` execution of the regexp.
@ -172,9 +170,9 @@ struct MatchImpl
{
/// NOTE This almost matches with the case of LikePatternIsStrstr.
const UInt8 * begin = data.data();
const UInt8 * const begin = data.data();
const UInt8 * const end = data.begin() + data.size();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
/// The current index in the array of strings.
size_t i = 0;
@ -230,6 +228,7 @@ struct MatchImpl
++i;
}
/// Tail, in which there can be no substring.
if (i < res.size())
memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
}
@ -238,14 +237,14 @@ struct MatchImpl
/// Very carefully crafted copy-paste.
static void vectorFixedConstant(
const ColumnString::Chars & data, size_t n, const std::string & pattern,
const ColumnString::Chars & data, size_t n, const String & pattern,
PaddedPODArray<UInt8> & res)
{
if (data.empty())
return;
String strstr_pattern;
/// A simple case where the LIKE expression reduces to finding a substring in a string
String strstr_pattern;
if (like && likePatternIsStrstr(pattern, strstr_pattern))
{
const UInt8 * begin = data.data();
@ -291,9 +290,9 @@ struct MatchImpl
{
size_t size = data.size() / n;
auto regexp = Regexps::get<like, true>(pattern);
auto regexp = Regexps::get<like, true, case_insensitive>(pattern);
std::string required_substring;
String required_substring;
bool is_trivial;
bool required_substring_is_prefix; /// for `anchored` execution of the regexp.

View File

@ -44,23 +44,20 @@ namespace Regexps
template <bool like>
inline Regexp createRegexp(const std::string & pattern, int flags)
{
return {pattern, flags};
}
template <>
inline Regexp createRegexp<true>(const std::string & pattern, int flags)
{
return {likePatternToRegexp(pattern), flags};
if constexpr (like)
return {likePatternToRegexp(pattern), flags};
else
return {pattern, flags};
}
/** Returns holder of an object from Pool.
* You must hold the ownership while using the object.
* In destructor, it returns the object back to the Pool for further reuse.
*/
template <bool like, bool no_capture, bool case_insensitive = false>
template <bool like, bool no_capture, bool case_insensitive>
inline Pool::Pointer get(const std::string & pattern)
{
/// C++11 has thread-safe function-local static on most modern compilers.
/// the Singleton is thread-safe in C++11
static Pool known_regexps; /// Different variables for different pattern parameters.
return known_regexps.get(pattern, [&pattern]

View File

@ -21,7 +21,7 @@ struct ExtractImpl
res_data.reserve(data.size() / 5);
res_offsets.resize(offsets.size());
const auto & regexp = Regexps::get<false, false>(pattern);
const auto & regexp = Regexps::get<false, false, false>(pattern);
unsigned capture = regexp->getNumberOfSubpatterns() > 0 ? 1 : 0;
OptimizedRegularExpression::MatchVec matches;

View File

@ -95,7 +95,7 @@ public:
throw Exception("Length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS);
using StringPiece = typename Regexps::Regexp::StringPieceType;
auto holder = Regexps::get<false, false>(needle);
auto holder = Regexps::get<false, false, false>(needle);
const auto & regexp = holder->getRE2();
if (!regexp)

View File

@ -63,7 +63,7 @@ public:
if (needle.empty())
throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS);
auto regexp = Regexps::get<false, false>(needle);
auto regexp = Regexps::get<false, false, false>(needle);
const auto & re2 = regexp->getRE2();
if (!re2)

View File

@ -0,0 +1,2 @@
AA 0 1
Aa 1 1

View File

@ -0,0 +1,10 @@
DROP TABLE IF EXISTS tab;
CREATE TABLE tab (col FixedString(2)) engine = MergeTree() ORDER BY col;
INSERT INTO tab VALUES ('AA') ('Aa');
SELECT col, col LIKE '%a', col ILIKE '%a' FROM tab WHERE col = 'AA';
SELECT col, col LIKE '%a', col ILIKE '%a' FROM tab WHERE col = 'Aa';
DROP TABLE IF EXISTS tab;