Make more multi-search methods work with non-const needles

After making the functions multi[Fuzzy]Match(Any|AnyIndex|AllIndices)() work
with non-const needles, 12 more functions started to fail in test
"00233_position_function_family":

multiSearchAny()
multiSearchAnyCaseInsensitive()
multiSearchAnyUTF8()
multiSearchAnyCaseInsensitiveUTF8()

multiSearchFirstPosition()
multiSearchFirstPositionCaseInsensitive()
multiSearchFirstPositionUTF8()
multiSearchFirstPositionCaseInsensitiveUTF8()

multiSearchFirstIndex()
multiSearchFirstIndexCaseInsensitive()
multiSearchFirstIndexUTF8()
multiSearchFirstIndexCaseInsensitiveUTF8()

Failing queries take the form
  select 0 = multiSearchAny('\0', CAST([], 'Array(String)'));
This commit is contained in:
Robert Schulze 2022-06-29 10:37:42 +00:00
parent ece61f6da3
commit 1eed72b525
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
7 changed files with 223 additions and 27 deletions

View File

@ -165,9 +165,8 @@ struct MultiMatchAllIndicesImpl
size_t prev_haystack_offset = 0;
for (size_t i = 0; i < haystack_offsets.size(); ++i)
{
Field field;
needles_col.get(i, field);
Array & needles_arr = DB::get<Array &>(field);
Field field = needles_col[i];
const Array & needles_arr = DB::get<Array &>(field);
std::vector<std::string_view> needles;
needles.reserve(needles_arr.size());

View File

@ -192,9 +192,8 @@ struct MultiMatchAnyImpl
size_t prev_haystack_offset = 0;
for (size_t i = 0; i < haystack_offsets.size(); ++i)
{
Field field;
needles_col.get(i, field);
Array & needles_arr = DB::get<Array &>(field);
Field field = needles_col[i];
const Array & needles_arr = DB::get<Array &>(field);
std::vector<std::string_view> needles;
needles.reserve(needles_arr.size());

View File

@ -30,7 +30,7 @@ struct MultiSearchFirstIndexImpl
const ColumnString::Offsets & haystack_offsets,
const Array & needles_arr,
PaddedPODArray<UInt64> & res,
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
PaddedPODArray<UInt64> & /*offsets*/,
bool /*allow_hyperscan*/,
size_t /*max_hyperscan_regexp_length*/,
size_t /*max_hyperscan_regexp_total_length*/)
@ -47,13 +47,15 @@ struct MultiSearchFirstIndexImpl
needles.emplace_back(needle.get<String>());
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
res.resize(haystack_string_size);
const size_t haystack_size = haystack_offsets.size();
res.resize(haystack_size);
size_t iteration = 0;
while (searcher.hasMoreToSearch())
{
size_t prev_offset = 0;
for (size_t j = 0; j < haystack_string_size; ++j)
for (size_t j = 0; j < haystack_size; ++j)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
@ -68,10 +70,51 @@ struct MultiSearchFirstIndexImpl
std::fill(res.begin(), res.end(), 0);
}
template <typename... Args>
static void vectorVector(Args &&...)
static void vectorVector(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const ColumnArray & needles_col,
PaddedPODArray<ResultType> & res,
PaddedPODArray<UInt64> & /*offsets*/,
bool /*allow_hyperscan*/,
size_t /*max_hyperscan_regexp_length*/,
size_t /*max_hyperscan_regexp_total_length*/)
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
const size_t haystack_size = haystack_offsets.size();
res.resize(haystack_size);
size_t prev_offset = 0;
for (size_t i = 0; i < haystack_size; ++i)
{
Field field = needles_col[i];
const Array & needles_arr = DB::get<Array &>(field);
std::vector<std::string_view> needles;
needles.reserve(needles_arr.size());
for (const auto & needle : needles_arr)
needles.emplace_back(needle.get<String>());
auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal
const auto * const haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[i] - prev_offset - 1;
size_t iteration = 0;
while (searcher.hasMoreToSearch())
{
if (iteration == 0 || res[i] == 0)
{
res[i] = searcher.searchOneFirstIndex(haystack, haystack_end);
}
++iteration;
}
if (iteration == 0)
{
res[i] = 0;
}
prev_offset = haystack_offsets[i];
}
}
};

View File

@ -30,7 +30,7 @@ struct MultiSearchFirstPositionImpl
const ColumnString::Offsets & haystack_offsets,
const Array & needles_arr,
PaddedPODArray<UInt64> & res,
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
PaddedPODArray<UInt64> & /*offsets*/,
bool /*allow_hyperscan*/,
size_t /*max_hyperscan_regexp_length*/,
size_t /*max_hyperscan_regexp_total_length*/)
@ -51,13 +51,15 @@ struct MultiSearchFirstPositionImpl
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
res.resize(haystack_string_size);
const size_t haystack_size = haystack_offsets.size();
res.resize(haystack_size);
size_t iteration = 0;
while (searcher.hasMoreToSearch())
{
size_t prev_offset = 0;
for (size_t j = 0; j < haystack_string_size; ++j)
for (size_t j = 0; j < haystack_size; ++j)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
@ -77,10 +79,64 @@ struct MultiSearchFirstPositionImpl
std::fill(res.begin(), res.end(), 0);
}
template <typename... Args>
static void vectorVector(Args &&...)
static void vectorVector(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const ColumnArray & needles_col,
PaddedPODArray<ResultType> & res,
PaddedPODArray<UInt64> & /*offsets*/,
bool /*allow_hyperscan*/,
size_t /*max_hyperscan_regexp_length*/,
size_t /*max_hyperscan_regexp_total_length*/)
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
const size_t haystack_size = haystack_offsets.size();
res.resize(haystack_size);
size_t prev_offset = 0;
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
{
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
for (size_t i = 0; i < haystack_size; ++i)
{
Field field = needles_col[i];
const Array & needles_arr = DB::get<Array &>(field);
std::vector<std::string_view> needles;
needles.reserve(needles_arr.size());
for (const auto & needle : needles_arr)
needles.emplace_back(needle.get<String>());
auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal
const auto * const haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[i] - prev_offset - 1;
size_t iteration = 0;
while (searcher.hasMoreToSearch())
{
if (iteration == 0 || res[i] == 0)
{
res[i] = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
}
else
{
UInt64 result = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
if (result != 0)
{
res[i] = std::min(result, res[i]);
}
}
++iteration;
}
if (iteration == 0)
{
res[i] = 0;
}
prev_offset = haystack_offsets[i];
}
}
};

View File

@ -30,7 +30,7 @@ struct MultiSearchImpl
const ColumnString::Offsets & haystack_offsets,
const Array & needles_arr,
PaddedPODArray<UInt8> & res,
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
PaddedPODArray<UInt64> & /*offsets*/,
bool /*allow_hyperscan*/,
size_t /*max_hyperscan_regexp_length*/,
size_t /*max_hyperscan_regexp_total_length*/)
@ -47,13 +47,15 @@ struct MultiSearchImpl
needles.emplace_back(needle.get<String>());
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
res.resize(haystack_string_size);
const size_t haystack_size = haystack_offsets.size();
res.resize(haystack_size);
size_t iteration = 0;
while (searcher.hasMoreToSearch())
{
size_t prev_offset = 0;
for (size_t j = 0; j < haystack_string_size; ++j)
for (size_t j = 0; j < haystack_size; ++j)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
@ -67,10 +69,49 @@ struct MultiSearchImpl
std::fill(res.begin(), res.end(), 0);
}
template <typename... Args>
static void vectorVector(Args &&...)
static void vectorVector(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const ColumnArray & needles_col,
PaddedPODArray<ResultType> & res,
PaddedPODArray<UInt64> & /*offsets*/,
bool /*allow_hyperscan*/,
size_t /*max_hyperscan_regexp_length*/,
size_t /*max_hyperscan_regexp_total_length*/)
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
const size_t haystack_size = haystack_offsets.size();
res.resize(haystack_size);
size_t prev_offset = 0;
for (size_t i = 0; i < haystack_size; ++i)
{
const auto * const haystack = &haystack_data[prev_offset];
const size_t haystack_length = haystack_offsets[i] - prev_offset - 1;
Field field = needles_col[i];
const Array & needles_arr = DB::get<Array &>(field);
std::vector<std::string_view> needles;
needles.reserve(needles_arr.size());
for (const auto & needle : needles_arr)
needles.emplace_back(needle.get<String>());
size_t iteration = 0;
for (size_t j = 0; j < needles_arr.size(); ++j)
{
auto searcher = Impl::createSearcherInSmallHaystack(needles[j].data(), needles[j].size());
if (iteration == 0 || !res[i])
{
const auto * match = searcher.search(haystack, haystack_length);
res[i] = (match != haystack + haystack_length);
}
++iteration;
}
if (iteration == 0)
res[i] = 0;
prev_offset = haystack_offsets[i];
}
}
};

View File

@ -188,3 +188,27 @@ MATCH
35 Hello .*ell.* 1
36 Hello o$ 1
37 Hello hE.*lO 0
MULTISEARCHANY
1
1
1
1
1
1
1
MULTISEARCHFIRSTINDEX
1
1
1
1
1
1
1
MULTISEARCHFIRSTPOSITION
1
1
1
1
1
1
1

View File

@ -1,3 +1,5 @@
-- tests of "(not) (i)like" functions
drop table if exists non_const_needle;
create table non_const_needle
@ -34,3 +36,35 @@ select id, haystack, needle, match(haystack, needle)
order by id;
drop table if exists non_const_needle;
-- rudimentary tests of "multiSearchAny()", "multiSearchFirstIndex()" and "multiSearchFirstPosition()" functions
select 'MULTISEARCHANY';
select multiSearchAny(materialize('Hello World'), materialize([])); -- { serverError 43 }
select 0 = multiSearchAny('Hello World', CAST([], 'Array(String)'));
select 1 = multiSearchAny(materialize('Hello World'), materialize(['orld']));
select 0 = multiSearchAny(materialize('Hello World'), materialize(['Hallo', 'Welt']));
select 1 = multiSearchAny(materialize('Hello World'), materialize(['Hallo', 'orld']));
select 1 = multiSearchAnyCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
select 1 = multiSearchAnyUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
select 1 = multiSearchAnyCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));
select 'MULTISEARCHFIRSTINDEX';
select multiSearchFirstIndex(materialize('Hello World'), materialize([])); -- { serverError 43 }
select 0 = multiSearchFirstIndex('Hello World', CAST([], 'Array(String)'));
select 1 = multiSearchFirstIndex(materialize('Hello World'), materialize(['orld']));
select 0 = multiSearchFirstIndex(materialize('Hello World'), materialize(['Hallo', 'Welt']));
select 2 = multiSearchFirstIndex(materialize('Hello World'), materialize(['Hallo', 'orld']));
select 1 = multiSearchFirstIndexCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
select 2 = multiSearchFirstIndexUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
select 1 = multiSearchFirstIndexCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));
select 'MULTISEARCHFIRSTPOSITION';
select multiSearchFirstPosition(materialize('Hello World'), materialize([])); -- { serverError 43 }
select 0 = multiSearchFirstPosition('Hello World', CAST([], 'Array(String)'));
select 8 = multiSearchFirstPosition(materialize('Hello World'), materialize(['orld']));
select 0 = multiSearchFirstPosition(materialize('Hello World'), materialize(['Hallo', 'Welt']));
select 8 = multiSearchFirstPosition(materialize('Hello World'), materialize(['Hallo', 'orld']));
select 7 = multiSearchFirstPositionCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
select 13 = multiSearchFirstPositionUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
select 7 = multiSearchFirstPositionCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));