mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 16:42:05 +00:00
Make more multi-search methods work with non-const needles
After making the functions multi[Fuzzy]Match(Any|AnyIndex|AllIndices)() work with non-const needles, 12 more functions started to fail in test "00233_position_function_family": multiSearchAny(), multiSearchAnyCaseInsensitive(), multiSearchAnyUTF8(), multiSearchAnyCaseInsensitiveUTF8(), multiSearchFirstPosition(), multiSearchFirstPositionCaseInsensitive(), multiSearchFirstPositionUTF8(), multiSearchFirstPositionCaseInsensitiveUTF8(), multiSearchFirstIndex(), multiSearchFirstIndexCaseInsensitive(), multiSearchFirstIndexUTF8(), multiSearchFirstIndexCaseInsensitiveUTF8(). Failing queries take the form: select 0 = multiSearchAny('\0', CAST([], 'Array(String)'));
This commit is contained in:
parent
ece61f6da3
commit
1eed72b525
@ -165,9 +165,8 @@ struct MultiMatchAllIndicesImpl
|
||||
size_t prev_haystack_offset = 0;
|
||||
for (size_t i = 0; i < haystack_offsets.size(); ++i)
|
||||
{
|
||||
Field field;
|
||||
needles_col.get(i, field);
|
||||
Array & needles_arr = DB::get<Array &>(field);
|
||||
Field field = needles_col[i];
|
||||
const Array & needles_arr = DB::get<Array &>(field);
|
||||
|
||||
std::vector<std::string_view> needles;
|
||||
needles.reserve(needles_arr.size());
|
||||
|
@ -192,9 +192,8 @@ struct MultiMatchAnyImpl
|
||||
size_t prev_haystack_offset = 0;
|
||||
for (size_t i = 0; i < haystack_offsets.size(); ++i)
|
||||
{
|
||||
Field field;
|
||||
needles_col.get(i, field);
|
||||
Array & needles_arr = DB::get<Array &>(field);
|
||||
Field field = needles_col[i];
|
||||
const Array & needles_arr = DB::get<Array &>(field);
|
||||
|
||||
std::vector<std::string_view> needles;
|
||||
needles.reserve(needles_arr.size());
|
||||
|
@ -30,7 +30,7 @@ struct MultiSearchFirstIndexImpl
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const Array & needles_arr,
|
||||
PaddedPODArray<UInt64> & res,
|
||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
|
||||
PaddedPODArray<UInt64> & /*offsets*/,
|
||||
bool /*allow_hyperscan*/,
|
||||
size_t /*max_hyperscan_regexp_length*/,
|
||||
size_t /*max_hyperscan_regexp_total_length*/)
|
||||
@ -47,13 +47,15 @@ struct MultiSearchFirstIndexImpl
|
||||
needles.emplace_back(needle.get<String>());
|
||||
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||
const size_t haystack_string_size = haystack_offsets.size();
|
||||
res.resize(haystack_string_size);
|
||||
|
||||
const size_t haystack_size = haystack_offsets.size();
|
||||
res.resize(haystack_size);
|
||||
|
||||
size_t iteration = 0;
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
size_t prev_offset = 0;
|
||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
||||
for (size_t j = 0; j < haystack_size; ++j)
|
||||
{
|
||||
const auto * haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||
@ -68,10 +70,51 @@ struct MultiSearchFirstIndexImpl
|
||||
std::fill(res.begin(), res.end(), 0);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void vectorVector(Args &&...)
|
||||
static void vectorVector(
|
||||
const ColumnString::Chars & haystack_data,
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const ColumnArray & needles_col,
|
||||
PaddedPODArray<ResultType> & res,
|
||||
PaddedPODArray<UInt64> & /*offsets*/,
|
||||
bool /*allow_hyperscan*/,
|
||||
size_t /*max_hyperscan_regexp_length*/,
|
||||
size_t /*max_hyperscan_regexp_total_length*/)
|
||||
{
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
|
||||
const size_t haystack_size = haystack_offsets.size();
|
||||
res.resize(haystack_size);
|
||||
|
||||
size_t prev_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < haystack_size; ++i)
|
||||
{
|
||||
Field field = needles_col[i];
|
||||
const Array & needles_arr = DB::get<Array &>(field);
|
||||
|
||||
std::vector<std::string_view> needles;
|
||||
needles.reserve(needles_arr.size());
|
||||
for (const auto & needle : needles_arr)
|
||||
needles.emplace_back(needle.get<String>());
|
||||
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal
|
||||
|
||||
const auto * const haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[i] - prev_offset - 1;
|
||||
|
||||
size_t iteration = 0;
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
if (iteration == 0 || res[i] == 0)
|
||||
{
|
||||
res[i] = searcher.searchOneFirstIndex(haystack, haystack_end);
|
||||
}
|
||||
++iteration;
|
||||
}
|
||||
if (iteration == 0)
|
||||
{
|
||||
res[i] = 0;
|
||||
}
|
||||
prev_offset = haystack_offsets[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -30,7 +30,7 @@ struct MultiSearchFirstPositionImpl
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const Array & needles_arr,
|
||||
PaddedPODArray<UInt64> & res,
|
||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
|
||||
PaddedPODArray<UInt64> & /*offsets*/,
|
||||
bool /*allow_hyperscan*/,
|
||||
size_t /*max_hyperscan_regexp_length*/,
|
||||
size_t /*max_hyperscan_regexp_total_length*/)
|
||||
@ -51,13 +51,15 @@ struct MultiSearchFirstPositionImpl
|
||||
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
|
||||
};
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||
const size_t haystack_string_size = haystack_offsets.size();
|
||||
res.resize(haystack_string_size);
|
||||
|
||||
const size_t haystack_size = haystack_offsets.size();
|
||||
res.resize(haystack_size);
|
||||
|
||||
size_t iteration = 0;
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
size_t prev_offset = 0;
|
||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
||||
for (size_t j = 0; j < haystack_size; ++j)
|
||||
{
|
||||
const auto * haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||
@ -77,10 +79,64 @@ struct MultiSearchFirstPositionImpl
|
||||
std::fill(res.begin(), res.end(), 0);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void vectorVector(Args &&...)
|
||||
static void vectorVector(
|
||||
const ColumnString::Chars & haystack_data,
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const ColumnArray & needles_col,
|
||||
PaddedPODArray<ResultType> & res,
|
||||
PaddedPODArray<UInt64> & /*offsets*/,
|
||||
bool /*allow_hyperscan*/,
|
||||
size_t /*max_hyperscan_regexp_length*/,
|
||||
size_t /*max_hyperscan_regexp_total_length*/)
|
||||
{
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
|
||||
const size_t haystack_size = haystack_offsets.size();
|
||||
res.resize(haystack_size);
|
||||
|
||||
size_t prev_offset = 0;
|
||||
|
||||
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
|
||||
{
|
||||
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < haystack_size; ++i)
|
||||
{
|
||||
Field field = needles_col[i];
|
||||
const Array & needles_arr = DB::get<Array &>(field);
|
||||
|
||||
std::vector<std::string_view> needles;
|
||||
needles.reserve(needles_arr.size());
|
||||
for (const auto & needle : needles_arr)
|
||||
needles.emplace_back(needle.get<String>());
|
||||
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal
|
||||
|
||||
const auto * const haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[i] - prev_offset - 1;
|
||||
|
||||
size_t iteration = 0;
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
if (iteration == 0 || res[i] == 0)
|
||||
{
|
||||
res[i] = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
|
||||
}
|
||||
else
|
||||
{
|
||||
UInt64 result = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
|
||||
if (result != 0)
|
||||
{
|
||||
res[i] = std::min(result, res[i]);
|
||||
}
|
||||
}
|
||||
++iteration;
|
||||
}
|
||||
if (iteration == 0)
|
||||
{
|
||||
res[i] = 0;
|
||||
}
|
||||
prev_offset = haystack_offsets[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -30,7 +30,7 @@ struct MultiSearchImpl
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const Array & needles_arr,
|
||||
PaddedPODArray<UInt8> & res,
|
||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
|
||||
PaddedPODArray<UInt64> & /*offsets*/,
|
||||
bool /*allow_hyperscan*/,
|
||||
size_t /*max_hyperscan_regexp_length*/,
|
||||
size_t /*max_hyperscan_regexp_total_length*/)
|
||||
@ -47,13 +47,15 @@ struct MultiSearchImpl
|
||||
needles.emplace_back(needle.get<String>());
|
||||
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||
const size_t haystack_string_size = haystack_offsets.size();
|
||||
res.resize(haystack_string_size);
|
||||
|
||||
const size_t haystack_size = haystack_offsets.size();
|
||||
res.resize(haystack_size);
|
||||
|
||||
size_t iteration = 0;
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
size_t prev_offset = 0;
|
||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
||||
for (size_t j = 0; j < haystack_size; ++j)
|
||||
{
|
||||
const auto * haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||
@ -67,10 +69,49 @@ struct MultiSearchImpl
|
||||
std::fill(res.begin(), res.end(), 0);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void vectorVector(Args &&...)
|
||||
static void vectorVector(
|
||||
const ColumnString::Chars & haystack_data,
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const ColumnArray & needles_col,
|
||||
PaddedPODArray<ResultType> & res,
|
||||
PaddedPODArray<UInt64> & /*offsets*/,
|
||||
bool /*allow_hyperscan*/,
|
||||
size_t /*max_hyperscan_regexp_length*/,
|
||||
size_t /*max_hyperscan_regexp_total_length*/)
|
||||
{
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needles", name);
|
||||
const size_t haystack_size = haystack_offsets.size();
|
||||
res.resize(haystack_size);
|
||||
|
||||
size_t prev_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < haystack_size; ++i)
|
||||
{
|
||||
const auto * const haystack = &haystack_data[prev_offset];
|
||||
const size_t haystack_length = haystack_offsets[i] - prev_offset - 1;
|
||||
|
||||
Field field = needles_col[i];
|
||||
const Array & needles_arr = DB::get<Array &>(field);
|
||||
|
||||
std::vector<std::string_view> needles;
|
||||
needles.reserve(needles_arr.size());
|
||||
for (const auto & needle : needles_arr)
|
||||
needles.emplace_back(needle.get<String>());
|
||||
|
||||
size_t iteration = 0;
|
||||
for (size_t j = 0; j < needles_arr.size(); ++j)
|
||||
{
|
||||
auto searcher = Impl::createSearcherInSmallHaystack(needles[j].data(), needles[j].size());
|
||||
if (iteration == 0 || !res[i])
|
||||
{
|
||||
const auto * match = searcher.search(haystack, haystack_length);
|
||||
res[i] = (match != haystack + haystack_length);
|
||||
}
|
||||
++iteration;
|
||||
}
|
||||
if (iteration == 0)
|
||||
res[i] = 0;
|
||||
prev_offset = haystack_offsets[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -188,3 +188,27 @@ MATCH
|
||||
35 Hello .*ell.* 1
|
||||
36 Hello o$ 1
|
||||
37 Hello hE.*lO 0
|
||||
MULTISEARCHANY
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
MULTISEARCHFIRSTINDEX
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
MULTISEARCHFIRSTPOSITION
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
|
@ -1,3 +1,5 @@
|
||||
-- tests of "(not) (i)like" functions
|
||||
|
||||
drop table if exists non_const_needle;
|
||||
|
||||
create table non_const_needle
|
||||
@ -34,3 +36,35 @@ select id, haystack, needle, match(haystack, needle)
|
||||
order by id;
|
||||
|
||||
drop table if exists non_const_needle;
|
||||
|
||||
-- rudimentary tests of "multiSearchAny()", "multiSearchFirstIndex()" and "multiSearchFirstPosition()" functions
|
||||
|
||||
select 'MULTISEARCHANY';
|
||||
select multiSearchAny(materialize('Hello World'), materialize([])); -- { serverError 43 }
|
||||
select 0 = multiSearchAny('Hello World', CAST([], 'Array(String)'));
|
||||
select 1 = multiSearchAny(materialize('Hello World'), materialize(['orld']));
|
||||
select 0 = multiSearchAny(materialize('Hello World'), materialize(['Hallo', 'Welt']));
|
||||
select 1 = multiSearchAny(materialize('Hello World'), materialize(['Hallo', 'orld']));
|
||||
select 1 = multiSearchAnyCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
|
||||
select 1 = multiSearchAnyUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
|
||||
select 1 = multiSearchAnyCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));
|
||||
|
||||
select 'MULTISEARCHFIRSTINDEX';
|
||||
select multiSearchFirstIndex(materialize('Hello World'), materialize([])); -- { serverError 43 }
|
||||
select 0 = multiSearchFirstIndex('Hello World', CAST([], 'Array(String)'));
|
||||
select 1 = multiSearchFirstIndex(materialize('Hello World'), materialize(['orld']));
|
||||
select 0 = multiSearchFirstIndex(materialize('Hello World'), materialize(['Hallo', 'Welt']));
|
||||
select 2 = multiSearchFirstIndex(materialize('Hello World'), materialize(['Hallo', 'orld']));
|
||||
select 1 = multiSearchFirstIndexCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
|
||||
select 2 = multiSearchFirstIndexUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
|
||||
select 1 = multiSearchFirstIndexCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));
|
||||
|
||||
select 'MULTISEARCHFIRSTPOSITION';
|
||||
select multiSearchFirstPosition(materialize('Hello World'), materialize([])); -- { serverError 43 }
|
||||
select 0 = multiSearchFirstPosition('Hello World', CAST([], 'Array(String)'));
|
||||
select 8 = multiSearchFirstPosition(materialize('Hello World'), materialize(['orld']));
|
||||
select 0 = multiSearchFirstPosition(materialize('Hello World'), materialize(['Hallo', 'Welt']));
|
||||
select 8 = multiSearchFirstPosition(materialize('Hello World'), materialize(['Hallo', 'orld']));
|
||||
select 7 = multiSearchFirstPositionCaseInsensitive(materialize('Hello World'), materialize(['WORLD']));
|
||||
select 13 = multiSearchFirstPositionUTF8(materialize('Hello World £'), materialize(['WORLD', '£']));
|
||||
select 7 = multiSearchFirstPositionCaseInsensitiveUTF8(materialize('Hello World £'), materialize(['WORLD']));
|
||||
|
Loading…
Reference in New Issue
Block a user