2020-03-29 17:04:16 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <Columns/ColumnString.h>
|
2023-06-26 15:00:46 +00:00
|
|
|
#include <Common/StringSearcher.h>
|
2022-05-16 20:23:51 +00:00
|
|
|
#include <Core/ColumnNumbers.h>
|
2020-03-29 17:04:16 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2020-03-30 13:21:54 +00:00
|
|
|
namespace ErrorCodes
{
    /// Thrown for an empty needle, or for a needle that contains separator characters.
    extern const int BAD_ARGUMENTS;
    /// Thrown for a non-constant needle or a FixedString haystack.
    extern const int ILLEGAL_COLUMN;
    /// Thrown when a start_pos argument is supplied.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
|
|
|
|
|
2020-03-29 17:04:16 +00:00
|
|
|
/** Token search the string, means that needle must be surrounded by some separator chars, like whitespace or puctuation.
|
|
|
|
*/
|
2023-06-28 08:41:39 +00:00
|
|
|
template <typename Name, typename Searcher, bool negate>
|
2020-03-29 17:04:16 +00:00
|
|
|
struct HasTokenImpl
|
|
|
|
{
|
|
|
|
using ResultType = UInt8;
|
|
|
|
|
|
|
|
static constexpr bool use_default_implementation_for_constants = true;
|
2020-08-01 21:14:23 +00:00
|
|
|
static constexpr bool supports_start_pos = false;
|
2021-09-21 16:43:46 +00:00
|
|
|
static constexpr auto name = Name::name;
|
2020-03-29 17:04:16 +00:00
|
|
|
|
2023-01-19 13:32:55 +00:00
|
|
|
static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1, 2}; }
|
2022-05-16 20:23:51 +00:00
|
|
|
|
2020-03-29 17:04:16 +00:00
|
|
|
static void vectorConstant(
|
2022-05-13 08:52:25 +00:00
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
2020-08-01 21:14:23 +00:00
|
|
|
const std::string & pattern,
|
|
|
|
const ColumnPtr & start_pos,
|
2023-01-17 13:27:41 +00:00
|
|
|
PaddedPODArray<UInt8> & res,
|
2023-01-23 22:27:48 +00:00
|
|
|
ColumnUInt8 * res_null)
|
2020-03-29 17:04:16 +00:00
|
|
|
{
|
2020-08-02 14:24:39 +00:00
|
|
|
if (start_pos != nullptr)
|
2021-09-21 16:43:46 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' does not support start_pos argument", name);
|
2020-08-01 21:14:23 +00:00
|
|
|
|
2023-07-16 22:10:42 +00:00
|
|
|
if (pattern.empty())
|
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Needle cannot be empty, because empty string isn't a token");
|
|
|
|
|
2022-05-13 08:52:25 +00:00
|
|
|
if (haystack_offsets.empty())
|
2020-03-29 17:04:16 +00:00
|
|
|
return;
|
|
|
|
|
2022-05-13 08:52:25 +00:00
|
|
|
const UInt8 * const begin = haystack_data.data();
|
|
|
|
const UInt8 * const end = haystack_data.data() + haystack_data.size();
|
2020-03-29 17:04:16 +00:00
|
|
|
const UInt8 * pos = begin;
|
|
|
|
|
2023-06-28 08:41:39 +00:00
|
|
|
if (!std::none_of(pattern.begin(), pattern.end(), isTokenSeparator))
|
2023-01-17 13:27:41 +00:00
|
|
|
{
|
2023-01-23 22:27:48 +00:00
|
|
|
if (res_null)
|
|
|
|
{
|
2023-06-26 15:00:46 +00:00
|
|
|
std::ranges::fill(res, 0);
|
|
|
|
std::ranges::fill(res_null->getData(), true);
|
|
|
|
return;
|
2023-01-23 22:27:48 +00:00
|
|
|
}
|
2023-06-26 15:00:46 +00:00
|
|
|
else
|
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Needle must not contain whitespace or separator characters");
|
2023-01-17 13:27:41 +00:00
|
|
|
}
|
2023-06-26 15:00:46 +00:00
|
|
|
|
2023-06-28 08:41:39 +00:00
|
|
|
size_t pattern_size = pattern.size();
|
|
|
|
Searcher searcher(pattern.data(), pattern_size, end - pos);
|
2023-06-26 15:00:46 +00:00
|
|
|
if (res_null)
|
|
|
|
std::ranges::fill(res_null->getData(), false);
|
|
|
|
|
|
|
|
/// The current index in the array of strings.
|
|
|
|
size_t i = 0;
|
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
|
|
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
2023-01-17 13:27:41 +00:00
|
|
|
{
|
2023-06-28 08:41:39 +00:00
|
|
|
/// The found substring is a token
|
|
|
|
if ((pos == begin || isTokenSeparator(pos[-1]))
|
|
|
|
&& (pos + pattern_size == end || isTokenSeparator(pos[pattern_size])))
|
2020-03-29 17:04:16 +00:00
|
|
|
{
|
2023-06-28 08:41:39 +00:00
|
|
|
/// Let's determine which index it refers to.
|
|
|
|
while (begin + haystack_offsets[i] <= pos)
|
|
|
|
{
|
|
|
|
res[i] = negate;
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// We check that the entry does not pass through the boundaries of strings.
|
|
|
|
if (pos + pattern.size() < begin + haystack_offsets[i])
|
|
|
|
res[i] = !negate;
|
|
|
|
else
|
|
|
|
res[i] = negate;
|
|
|
|
|
|
|
|
pos = begin + haystack_offsets[i];
|
2023-06-26 15:00:46 +00:00
|
|
|
++i;
|
2020-03-29 17:04:16 +00:00
|
|
|
}
|
2023-06-26 15:00:46 +00:00
|
|
|
else
|
2023-06-28 08:41:39 +00:00
|
|
|
{
|
|
|
|
/// Not a token. Jump over it.
|
|
|
|
pos += pattern_size;
|
|
|
|
}
|
2020-03-29 17:04:16 +00:00
|
|
|
}
|
2023-06-26 15:00:46 +00:00
|
|
|
|
|
|
|
/// Tail, in which there can be no substring.
|
|
|
|
if (i < res.size())
|
|
|
|
memset(&res[i], negate, (res.size() - i) * sizeof(res[0]));
|
2020-03-29 17:04:16 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename... Args>
|
|
|
|
static void vectorVector(Args &&...)
|
|
|
|
{
|
2021-09-21 16:43:46 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needle argument", name);
|
2020-03-29 17:04:16 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Search different needles in single haystack.
|
|
|
|
template <typename... Args>
|
|
|
|
static void constantVector(Args &&...)
|
|
|
|
{
|
2021-09-21 16:43:46 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needle argument", name);
|
2020-03-29 17:04:16 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename... Args>
|
|
|
|
static void vectorFixedConstant(Args &&...)
|
|
|
|
{
|
2021-09-21 16:43:46 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name);
|
2020-03-29 17:04:16 +00:00
|
|
|
}
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
|
|
|
|
template <typename... Args>
|
|
|
|
static void vectorFixedVector(Args &&...)
|
|
|
|
{
|
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name);
|
|
|
|
}
|
2023-06-28 08:41:39 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
static bool isTokenSeparator(UInt8 c)
|
|
|
|
{
|
|
|
|
return isASCII(c) && !isAlphaNumericASCII(c);
|
|
|
|
}
|
2020-03-29 17:04:16 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|