mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 16:42:05 +00:00
Make more multi-search methods work with non-const needles
After making the functions multi[Fuzzy]Match(Any|AnyIndex|AllIndices)() work with non-const needles, 12 more functions started to fail in test "00233_position_function_family": multiSearchAny(), multiSearchAnyCaseInsensitive(), multiSearchAnyUTF8(), multiSearchAnyCaseInsensitiveUTF8(), multiSearchFirstPosition(), multiSearchFirstPositionCaseInsensitive(), multiSearchFirstPositionUTF8(), multiSearchFirstPositionCaseInsensitiveUTF8(), multiSearchFirstIndex(), multiSearchFirstIndexCaseInsensitive(), multiSearchFirstIndexUTF8() and multiSearchFirstIndexCaseInsensitiveUTF8(). Failing queries take the form: select 0 = multiSearchAny('\0', CAST([], 'Array(String)'));
This commit is contained in:
parent
ece61f6da3
commit
1eed72b525
@ -165,9 +165,8 @@ struct MultiMatchAllIndicesImpl
|
|||||||
size_t prev_haystack_offset = 0;
|
size_t prev_haystack_offset = 0;
|
||||||
for (size_t i = 0; i < haystack_offsets.size(); ++i)
|
for (size_t i = 0; i < haystack_offsets.size(); ++i)
|
||||||
{
|
{
|
||||||
Field field;
|
Field field = needles_col[i];
|
||||||
needles_col.get(i, field);
|
const Array & needles_arr = DB::get<Array &>(field);
|
||||||
Array & needles_arr = DB::get<Array &>(field);
|
|
||||||
|
|
||||||
std::vector<std::string_view> needles;
|
std::vector<std::string_view> needles;
|
||||||
needles.reserve(needles_arr.size());
|
needles.reserve(needles_arr.size());
|
||||||
|
@ -192,9 +192,8 @@ struct MultiMatchAnyImpl
|
|||||||
size_t prev_haystack_offset = 0;
|
size_t prev_haystack_offset = 0;
|
||||||
for (size_t i = 0; i < haystack_offsets.size(); ++i)
|
for (size_t i = 0; i < haystack_offsets.size(); ++i)
|
||||||
{
|
{
|
||||||
Field field;
|
Field field = needles_col[i];
|
||||||
needles_col.get(i, field);
|
const Array & needles_arr = DB::get<Array &>(field);
|
||||||
Array & needles_arr = DB::get<Array &>(field);
|
|
||||||
|
|
||||||
std::vector<std::string_view> needles;
|
std::vector<std::string_view> needles;
|
||||||
needles.reserve(needles_arr.size());
|
needles.reserve(needles_arr.size());
|
||||||
|
@ -30,7 +30,7 @@ struct MultiSearchFirstIndexImpl
|
|||||||
const ColumnString::Offsets & haystack_offsets,
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
const Array & needles_arr,
|
const Array & needles_arr,
|
||||||
PaddedPODArray<UInt64> & res,
|
PaddedPODArray<UInt64> & res,
|
||||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/)
|
||||||
@ -47,13 +47,15 @@ struct MultiSearchFirstIndexImpl
|
|||||||
needles.emplace_back(needle.get<String>());
|
needles.emplace_back(needle.get<String>());
|
||||||
|
|
||||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||||
const size_t haystack_string_size = haystack_offsets.size();
|
|
||||||
res.resize(haystack_string_size);
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_size);
|
||||||
|
|
||||||
size_t iteration = 0;
|
size_t iteration = 0;
|
||||||
while (searcher.hasMoreToSearch())
|
while (searcher.hasMoreToSearch())
|
||||||
{
|
{
|
||||||
size_t prev_offset = 0;
|
size_t prev_offset = 0;
|
||||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
for (size_t j = 0; j < haystack_size; ++j)
|
||||||
{
|
{
|
||||||
const auto * haystack = &haystack_data[prev_offset];
|
const auto * haystack = &haystack_data[prev_offset];
|
||||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||||
@ -68,10 +70,51 @@ struct MultiSearchFirstIndexImpl
|
|||||||
std::fill(res.begin(), res.end(), 0);
|
std::fill(res.begin(), res.end(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename... Args>
|
static void vectorVector(
|
||||||
static void vectorVector(Args &&...)
|
const ColumnString::Chars & haystack_data,
|
||||||
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
|
const ColumnArray & needles_col,
|
||||||
|
PaddedPODArray<ResultType> & res,
|
||||||
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
|
bool /*allow_hyperscan*/,
|
||||||
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
|
size_t /*max_hyperscan_regexp_total_length*/)
|
||||||
{
|
{
|
||||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_size);
|
||||||
|
|
||||||
|
size_t prev_offset = 0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < haystack_size; ++i)
|
||||||
|
{
|
||||||
|
Field field = needles_col[i];
|
||||||
|
const Array & needles_arr = DB::get<Array &>(field);
|
||||||
|
|
||||||
|
std::vector<std::string_view> needles;
|
||||||
|
needles.reserve(needles_arr.size());
|
||||||
|
for (const auto & needle : needles_arr)
|
||||||
|
needles.emplace_back(needle.get<String>());
|
||||||
|
|
||||||
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal
|
||||||
|
|
||||||
|
const auto * const haystack = &haystack_data[prev_offset];
|
||||||
|
const auto * haystack_end = haystack + haystack_offsets[i] - prev_offset - 1;
|
||||||
|
|
||||||
|
size_t iteration = 0;
|
||||||
|
while (searcher.hasMoreToSearch())
|
||||||
|
{
|
||||||
|
if (iteration == 0 || res[i] == 0)
|
||||||
|
{
|
||||||
|
res[i] = searcher.searchOneFirstIndex(haystack, haystack_end);
|
||||||
|
}
|
||||||
|
++iteration;
|
||||||
|
}
|
||||||
|
if (iteration == 0)
|
||||||
|
{
|
||||||
|
res[i] = 0;
|
||||||
|
}
|
||||||
|
prev_offset = haystack_offsets[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ struct MultiSearchFirstPositionImpl
|
|||||||
const ColumnString::Offsets & haystack_offsets,
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
const Array & needles_arr,
|
const Array & needles_arr,
|
||||||
PaddedPODArray<UInt64> & res,
|
PaddedPODArray<UInt64> & res,
|
||||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/)
|
||||||
@ -51,13 +51,15 @@ struct MultiSearchFirstPositionImpl
|
|||||||
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
|
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
|
||||||
};
|
};
|
||||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||||
const size_t haystack_string_size = haystack_offsets.size();
|
|
||||||
res.resize(haystack_string_size);
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_size);
|
||||||
|
|
||||||
size_t iteration = 0;
|
size_t iteration = 0;
|
||||||
while (searcher.hasMoreToSearch())
|
while (searcher.hasMoreToSearch())
|
||||||
{
|
{
|
||||||
size_t prev_offset = 0;
|
size_t prev_offset = 0;
|
||||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
for (size_t j = 0; j < haystack_size; ++j)
|
||||||
{
|
{
|
||||||
const auto * haystack = &haystack_data[prev_offset];
|
const auto * haystack = &haystack_data[prev_offset];
|
||||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||||
@ -77,10 +79,64 @@ struct MultiSearchFirstPositionImpl
|
|||||||
std::fill(res.begin(), res.end(), 0);
|
std::fill(res.begin(), res.end(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename... Args>
|
static void vectorVector(
|
||||||
static void vectorVector(Args &&...)
|
const ColumnString::Chars & haystack_data,
|
||||||
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
|
const ColumnArray & needles_col,
|
||||||
|
PaddedPODArray<ResultType> & res,
|
||||||
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
|
bool /*allow_hyperscan*/,
|
||||||
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
|
size_t /*max_hyperscan_regexp_total_length*/)
|
||||||
{
|
{
|
||||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_size);
|
||||||
|
|
||||||
|
size_t prev_offset = 0;
|
||||||
|
|
||||||
|
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
|
||||||
|
{
|
||||||
|
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
|
||||||
|
};
|
||||||
|
|
||||||
|
for (size_t i = 0; i < haystack_size; ++i)
|
||||||
|
{
|
||||||
|
Field field = needles_col[i];
|
||||||
|
const Array & needles_arr = DB::get<Array &>(field);
|
||||||
|
|
||||||
|
std::vector<std::string_view> needles;
|
||||||
|
needles.reserve(needles_arr.size());
|
||||||
|
for (const auto & needle : needles_arr)
|
||||||
|
needles.emplace_back(needle.get<String>());
|
||||||
|
|
||||||
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal
|
||||||
|
|
||||||
|
const auto * const haystack = &haystack_data[prev_offset];
|
||||||
|
const auto * haystack_end = haystack + haystack_offsets[i] - prev_offset - 1;
|
||||||
|
|
||||||
|
size_t iteration = 0;
|
||||||
|
while (searcher.hasMoreToSearch())
|
||||||
|
{
|
||||||
|
if (iteration == 0 || res[i] == 0)
|
||||||
|
{
|
||||||
|
res[i] = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
UInt64 result = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
|
||||||
|
if (result != 0)
|
||||||
|
{
|
||||||
|
res[i] = std::min(result, res[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
++iteration;
|
||||||
|
}
|
||||||
|
if (iteration == 0)
|
||||||
|
{
|
||||||
|
res[i] = 0;
|
||||||
|
}
|
||||||
|
prev_offset = haystack_offsets[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ struct MultiSearchImpl
|
|||||||
const ColumnString::Offsets & haystack_offsets,
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
const Array & needles_arr,
|
const Array & needles_arr,
|
||||||
PaddedPODArray<UInt8> & res,
|
PaddedPODArray<UInt8> & res,
|
||||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/)
|
||||||
@ -47,13 +47,15 @@ struct MultiSearchImpl
|
|||||||
needles.emplace_back(needle.get<String>());
|
needles.emplace_back(needle.get<String>());
|
||||||
|
|
||||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||||
const size_t haystack_string_size = haystack_offsets.size();
|
|
||||||
res.resize(haystack_string_size);
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_size);
|
||||||
|
|
||||||
size_t iteration = 0;
|
size_t iteration = 0;
|
||||||
while (searcher.hasMoreToSearch())
|
while (searcher.hasMoreToSearch())
|
||||||
{
|
{
|
||||||
size_t prev_offset = 0;
|
size_t prev_offset = 0;
|
||||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
for (size_t j = 0; j < haystack_size; ++j)
|
||||||
{
|
{
|
||||||
const auto * haystack = &haystack_data[prev_offset];
|
const auto * haystack = &haystack_data[prev_offset];
|
||||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||||
@ -67,10 +69,49 @@ struct MultiSearchImpl
|
|||||||
std::fill(res.begin(), res.end(), 0);
|
std::fill(res.begin(), res.end(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename... Args>
|
static void vectorVector(
|
||||||
static void vectorVector(Args &&...)
|
const ColumnString::Chars & haystack_data,
|
||||||
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
|
const ColumnArray & needles_col,
|
||||||
|
PaddedPODArray<ResultType> & res,
|
||||||
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
|
bool /*allow_hyperscan*/,
|
||||||
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
|
size_t /*max_hyperscan_regexp_total_length*/)
|
||||||
{
|
{
|
||||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_size);
|
||||||
|
|
||||||
|
size_t prev_offset = 0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < haystack_size; ++i)
|
||||||
|
{
|
||||||
|
const auto * const haystack = &haystack_data[prev_offset];
|
||||||
|
const size_t haystack_length = haystack_offsets[i] - prev_offset - 1;
|
||||||
|
|
||||||
|
Field field = needles_col[i];
|
||||||
|
const Array & needles_arr = DB::get<Array &>(field);
|
||||||
|
|
||||||
|
std::vector<std::string_view> needles;
|
||||||
|
needles.reserve(needles_arr.size());
|
||||||
|
for (const auto & needle : needles_arr)
|
||||||
|
needles.emplace_back(needle.get<String>());
|
||||||
|
|
||||||
|
size_t iteration = 0;
|
||||||
|
for (size_t j = 0; j < needles_arr.size(); ++j)
|
||||||
|
{
|
||||||
|
auto searcher = Impl::createSearcherInSmallHaystack(needles[j].data(), needles[j].size());
|
||||||
|
if (iteration == 0 || !res[i])
|
||||||
|
{
|
||||||
|
const auto * match = searcher.search(haystack, haystack_length);
|
||||||
|
res[i] = (match != haystack + haystack_length);
|
||||||
|
}
|
||||||
|
++iteration;
|
||||||
|
}
|
||||||
|
if (iteration == 0)
|
||||||
|
res[i] = 0;
|
||||||
|
prev_offset = haystack_offsets[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -188,3 +188,27 @@ MATCH
|
|||||||
35 Hello .*ell.* 1
|
35 Hello .*ell.* 1
|
||||||
36 Hello o$ 1
|
36 Hello o$ 1
|
||||||
37 Hello hE.*lO 0
|
37 Hello hE.*lO 0
|
||||||
|
MULTISEARCHANY
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
MULTISEARCHFIRSTINDEX
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
MULTISEARCHFIRSTPOSITION
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
-- tests of "(not) (i)like" functions
|
||||||
|
|
||||||
drop table if exists non_const_needle;
|
drop table if exists non_const_needle;
|
||||||
|
|
||||||
create table non_const_needle
|
create table non_const_needle
|
||||||
@ -34,3 +36,35 @@ select id, haystack, needle, match(haystack, needle)
|
|||||||
order by id;
|
order by id;
|
||||||
|
|
||||||
drop table if exists non_const_needle;
|
drop table if exists non_const_needle;
|
||||||
|
|
||||||
|
-- rudimentary tests of "multiSearchAny()", "multiSearchFirstIndex()" and "multiSearchFirstPosition()" functions
|
||||||
|
|
||||||
|
select 'MULTISEARCHANY';
|
||||||
|
select multiSearchAny(materialize('Hello World'), materialize([])); -- { serverError 43 }
|
||||||
|
select 0 = multiSearchAny('Hello World', CAST([], 'Array(String)'));
|
||||||
|
select 1 = multiSearchAny(materialize('Hello World'), materialize(['orld']));
|
||||||
|
select 0 = multiSearchAny(materialize('Hello World'), materialize(['Hallo', 'Welt']));
|
||||||
|
select 1 = multiSearchAny(materialize('Hello World'), materialize(['Hallo', 'orld']));
|
||||||
|
select 1 = multiSearchAnyCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
|
||||||
|
select 1 = multiSearchAnyUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
|
||||||
|
select 1 = multiSearchAnyCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));
|
||||||
|
|
||||||
|
select 'MULTISEARCHFIRSTINDEX';
|
||||||
|
select multiSearchFirstIndex(materialize('Hello World'), materialize([])); -- { serverError 43 }
|
||||||
|
select 0 = multiSearchFirstIndex('Hello World', CAST([], 'Array(String)'));
|
||||||
|
select 1 = multiSearchFirstIndex(materialize('Hello World'), materialize(['orld']));
|
||||||
|
select 0 = multiSearchFirstIndex(materialize('Hello World'), materialize(['Hallo', 'Welt']));
|
||||||
|
select 2 = multiSearchFirstIndex(materialize('Hello World'), materialize(['Hallo', 'orld']));
|
||||||
|
select 1 = multiSearchFirstIndexCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
|
||||||
|
select 2 = multiSearchFirstIndexUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
|
||||||
|
select 1 = multiSearchFirstIndexCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));
|
||||||
|
|
||||||
|
select 'MULTISEARCHFIRSTPOSITION';
|
||||||
|
select multiSearchFirstPosition(materialize('Hello World'), materialize([])); -- { serverError 43 }
|
||||||
|
select 0 = multiSearchFirstPosition('Hello World', CAST([], 'Array(String)'));
|
||||||
|
select 8 = multiSearchFirstPosition(materialize('Hello World'), materialize(['orld']));
|
||||||
|
select 0 = multiSearchFirstPosition(materialize('Hello World'), materialize(['Hallo', 'Welt']));
|
||||||
|
select 8 = multiSearchFirstPosition(materialize('Hello World'), materialize(['Hallo', 'orld']));
|
||||||
|
select 7 = multiSearchFirstPositionCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
|
||||||
|
select 13 = multiSearchFirstPositionUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
|
||||||
|
select 7 = multiSearchFirstPositionCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));
|
||||||
|
Loading…
Reference in New Issue
Block a user