Merge pull request #52050 from arenadata/ADQM-982

Subsequence string matching (new hasSubsequence() function)
This commit is contained in:
robot-clickhouse-ci-2 2023-07-23 21:18:27 +02:00 committed by GitHub
commit 66c0015b87
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 521 additions and 1 deletions

View File

@ -631,3 +631,53 @@ Result:
│ 100 │ 200 │ 100-200 │ 100 │
└──────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────┴───────────────────────────────────────────┘
```
## hasSubsequence
Returns 1 if needle is a subsequence of haystack, or 0 otherwise.
A subsequence of a string is a sequence that can be derived from the given string by deleting zero or more elements without changing the order of the remaining elements.
**Syntax**
``` sql
hasSubsequence(haystack, needle)
```
**Arguments**
- `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `needle` — Subsequence to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal).
**Returned values**
- 1, if needle is a subsequence of haystack.
- 0, otherwise.
Type: `UInt8`.
**Examples**
``` sql
SELECT hasSubsequence('garbage', 'arg') ;
```
Result:
``` text
┌─hasSubsequence('garbage', 'arg')─┐
│ 1 │
└──────────────────────────────────┘
```
## hasSubsequenceCaseInsensitive
Like [hasSubsequence](#hasSubsequence) but searches case-insensitively.
## hasSubsequenceUTF8
Like [hasSubsequence](#hasSubsequence) but assumes `haystack` and `needle` are UTF-8 encoded strings.
## hasSubsequenceCaseInsensitiveUTF8
Like [hasSubsequenceUTF8](#hasSubsequenceUTF8) but searches case-insensitively.

View File

@ -801,3 +801,55 @@ SELECT countSubstringsCaseInsensitiveUTF8('аБв__АбВ__абв', 'Абв');
│ 3 │
└────────────────────────────────────────────────────────────┘
```
## hasSubsequence(haystack, needle) {#hasSubsequence}
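Возвращает 1, если needle является подпоследовательностью haystack, иначе 0.
Подпоследовательность строки — это последовательность, которую можно получить из данной строки удалением нуля или более элементов без изменения порядка оставшихся элементов.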
**Синтаксис**
``` sql
hasSubsequence(haystack, needle)
```
**Аргументы**
- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal).
- `needle` — подпоследовательность, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal).
**Возвращаемые значения**
- 1, если needle является подпоследовательностью haystack.
- 0, в противном случае.
Тип: `UInt8`.
**Примеры**
Запрос:
``` sql
SELECT hasSubsequence('garbage', 'arg') ;
```
Результат:
``` text
┌─hasSubsequence('garbage', 'arg')─┐
│ 1 │
└──────────────────────────────────┘
```
## hasSubsequenceCaseInsensitive
Такая же, как и [hasSubsequence](#hasSubsequence), но работает без учета регистра.
## hasSubsequenceUTF8 {#hasSubsequenceUTF8}
Такая же, как и [hasSubsequence](#hasSubsequence), при допущении, что `haystack` и `needle` содержат набор кодовых точек, представляющий текст в кодировке UTF-8.
## hasSubsequenceCaseInsensitiveUTF8
Такая же, как и [hasSubsequenceUTF8](#hasSubsequenceUTF8), но работает без учета регистра.

View File

@ -0,0 +1,158 @@
#pragma once
#include <Columns/ColumnString.h>
#include <Columns/ColumnConst.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/GatherUtils/Sources.h>
#include <Functions/GatherUtils/Sinks.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
}
namespace
{
using namespace GatherUtils;
/// Shared implementation of the hasSubsequence* family of functions.
///
/// Template parameters:
///  - `Name` provides the function name through `Name::name`.
///  - `Impl` provides `Impl::is_utf8` (selects byte-wise or code-point-wise matching)
///    and `Impl::toLowerIfNeed(x)`, which is applied to both compared units before
///    they are tested for equality.
///
/// The function takes two String arguments (haystack, needle) and produces UInt8:
/// 1 if needle is a subsequence of haystack, 0 otherwise.
template <typename Name, typename Impl>
class HasSubsequenceImpl : public IFunction
{
public:
    static constexpr auto name = Name::name;

    static FunctionPtr create(ContextPtr) { return std::make_shared<HasSubsequenceImpl>(); }

    String getName() const override { return name; }

    size_t getNumberOfArguments() const override { return 2; }

    bool isVariadic() const override { return false; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    /// Constant columns are dispatched explicitly in executeImpl(), so the default handling is switched off.
    bool useDefaultImplementationForConstants() const override { return false; }

    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {};}

    /// Both arguments must be String; the result type is always UInt8.
    /// Throws ILLEGAL_TYPE_OF_ARGUMENT for the first argument that is not a String.
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        for (size_t arg_idx = 0; arg_idx < 2; ++arg_idx)
        {
            const auto & arg_type = arguments[arg_idx];
            if (!isString(arg_type))
                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                    "Illegal type {} of argument of function {}",
                    arg_type->getName(), getName());
        }

        return std::make_shared<DataTypeNumber<UInt8>>();
    }

    /// Picks the source wrappers matching the (full / constant) shape of each argument
    /// and fills one UInt8 per row. Throws ILLEGAL_COLUMN for any other column combination.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
    {
        const auto * raw_haystack = arguments[0].column.get();
        const auto * raw_needle = arguments[1].column.get();

        const ColumnConst * const_haystack = checkAndGetColumnConst<ColumnString>(raw_haystack);
        const ColumnConst * const_needle = checkAndGetColumnConst<ColumnString>(raw_needle);
        const ColumnString * full_haystack = checkAndGetColumn<ColumnString>(raw_haystack);
        const ColumnString * full_needle = checkAndGetColumn<ColumnString>(raw_needle);

        auto col_res = ColumnVector<UInt8>::create();
        auto & result_values = col_res->getData();
        result_values.resize(input_rows_count);

        if (full_haystack && full_needle)
        {
            execute(StringSource{*full_haystack}, StringSource{*full_needle}, result_values);
        }
        else if (full_haystack && const_needle)
        {
            execute(StringSource{*full_haystack}, ConstSource<StringSource>{*const_needle}, result_values);
        }
        else if (const_haystack && full_needle)
        {
            execute(ConstSource<StringSource>{*const_haystack}, StringSource{*full_needle}, result_values);
        }
        else if (const_haystack && const_needle)
        {
            execute(ConstSource<StringSource>{*const_haystack}, ConstSource<StringSource>{*const_needle}, result_values);
        }
        else
        {
            throw Exception(
                ErrorCodes::ILLEGAL_COLUMN,
                "Illegal columns {} and {} of arguments of function {}",
                arguments[0].column->getName(),
                arguments[1].column->getName(),
                getName());
        }

        return col_res;
    }

private:
    /// Walks both sources in lockstep and stores the per-row verdict at the haystack's row number.
    template <typename SourceHaystack, typename SourceNeedle>
    void execute(
        SourceHaystack && haystacks,
        SourceNeedle && needles,
        PaddedPODArray<UInt8> & res_data) const
    {
        for (; !haystacks.isEnd(); haystacks.next(), needles.next())
        {
            const auto hay = haystacks.getWhole();
            const auto ndl = needles.getWhole();
            const size_t row = haystacks.rowNum();

            if constexpr (Impl::is_utf8)
                res_data[row] = hasSubsequenceUTF8(hay.data, hay.size, ndl.data, ndl.size);
            else
                res_data[row] = hasSubsequence(hay.data, hay.size, ndl.data, ndl.size);
        }
    }

    /// Byte-wise check: scans the haystack once, consuming one needle byte per match.
    /// Returns 1 when all `needle_size` bytes were consumed (an empty needle always matches).
    static UInt8 hasSubsequence(const UInt8 * haystack, size_t haystack_size, const UInt8 * needle, size_t needle_size)
    {
        size_t matched = 0;
        size_t pos = 0;
        while (pos < haystack_size && matched < needle_size)
        {
            if (Impl::toLowerIfNeed(needle[matched]) == Impl::toLowerIfNeed(haystack[pos]))
                ++matched;
            ++pos;
        }
        return matched == needle_size;
    }

    /// Code-point-wise check. Both cursors advance by UTF8::seqLength() of the byte they point at.
    /// The scan stops when either input is exhausted or a code point fails to decode;
    /// the result is 1 only if the needle cursor landed exactly on the needle end.
    static UInt8 hasSubsequenceUTF8(const UInt8 * haystack, size_t haystack_size, const UInt8 * needle, size_t needle_size)
    {
        /// An empty needle is a subsequence of anything.
        if (!needle_size)
            return 1;

        const auto * const haystack_end = haystack + haystack_size;
        const auto * const needle_end = needle + needle_size;
        const auto * haystack_cur = haystack;
        const auto * needle_cur = needle;

        auto haystack_cp = UTF8::convertUTF8ToCodePoint(haystack_cur, haystack_end - haystack_cur);
        auto needle_cp = UTF8::convertUTF8ToCodePoint(needle_cur, needle_end - needle_cur);

        /// Nothing to compare if the very first code point of either string cannot be decoded.
        if (!haystack_cp || !needle_cp)
            return 0;

        while (haystack_cp && needle_cp)
        {
            const bool same_code_point = Impl::toLowerIfNeed(*needle_cp) == Impl::toLowerIfNeed(*haystack_cp);
            if (same_code_point)
            {
                /// Consume the matched needle code point; stop once the needle is used up.
                needle_cur += UTF8::seqLength(*needle_cur);
                if (needle_cur >= needle_end)
                    break;
                needle_cp = UTF8::convertUTF8ToCodePoint(needle_cur, needle_end - needle_cur);
            }

            /// The haystack always moves forward, whether or not this code point matched.
            haystack_cur += UTF8::seqLength(*haystack_cur);
            if (haystack_cur >= haystack_end)
                break;
            haystack_cp = UTF8::convertUTF8ToCodePoint(haystack_cur, haystack_end - haystack_cur);
        }

        return needle_cur == needle_end;
    }
};
}
}

View File

@ -0,0 +1,30 @@
#include <Functions/FunctionFactory.h>
#include <Functions/HasSubsequenceImpl.h>
namespace DB
{
namespace
{
/// Matching policy for hasSubsequence: non-UTF-8 mode, case-sensitive.
struct HasSubsequenceCaseSensitiveASCII
{
    static constexpr bool is_utf8 = false;

    /// Identity: case-sensitive matching compares the value unchanged.
    static int toLowerIfNeed(int byte)
    {
        return byte;
    }
};
/// Holds the function name string.
struct NameHasSubsequence
{
    static constexpr const char * name = "hasSubsequence";
};
/// Concrete function type: the shared implementation bound to this file's name and matching policy.
using FunctionHasSubsequence
    = HasSubsequenceImpl<NameHasSubsequence, HasSubsequenceCaseSensitiveASCII>;
}
/// Registers FunctionHasSubsequence in the function factory.
/// The FunctionFactory::CaseInsensitive flag is passed, so the name is presumably matched
/// regardless of letter case; the empty `{}` first argument is presumably default
/// documentation/metadata - confirm against FunctionFactory::registerFunction.
REGISTER_FUNCTION(hasSubsequence)
{
factory.registerFunction<FunctionHasSubsequence>({}, FunctionFactory::CaseInsensitive);
}
}

View File

@ -0,0 +1,29 @@
#include <Functions/FunctionFactory.h>
#include <Functions/HasSubsequenceImpl.h>
namespace DB
{
namespace
{
/// Matching policy for hasSubsequenceCaseInsensitive: non-UTF-8 mode, ASCII letters compared without regard to case.
struct HasSubsequenceCaseInsensitiveASCII
{
    static constexpr bool is_utf8 = false;

    /// Lowers 'A'..'Z' to 'a'..'z' and returns every other value unchanged.
    /// Done with explicit range arithmetic rather than std::tolower so that the result
    /// does not depend on the process locale and no <cctype> include is required.
    static int toLowerIfNeed(int c)
    {
        return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
    }
};
/// Holds the function name string.
struct NameHasSubsequenceCaseInsensitive
{
    static constexpr const char * name = "hasSubsequenceCaseInsensitive";
};
/// Concrete function type: the shared implementation bound to this file's name and matching policy.
using FunctionHasSubsequenceCaseInsensitive
    = HasSubsequenceImpl<NameHasSubsequenceCaseInsensitive, HasSubsequenceCaseInsensitiveASCII>;
}
/// Registers FunctionHasSubsequenceCaseInsensitive in the function factory.
/// The FunctionFactory::CaseInsensitive flag is passed, so the name is presumably matched
/// regardless of letter case; the empty `{}` first argument is presumably default
/// documentation/metadata - confirm against FunctionFactory::registerFunction.
REGISTER_FUNCTION(hasSubsequenceCaseInsensitive)
{
factory.registerFunction<FunctionHasSubsequenceCaseInsensitive>({}, FunctionFactory::CaseInsensitive);
}
}

View File

@ -0,0 +1,31 @@
#include <Functions/FunctionFactory.h>
#include <Functions/HasSubsequenceImpl.h>
#include "Poco/Unicode.h"
namespace DB
{
namespace
{
/// Matching policy for hasSubsequenceCaseInsensitiveUTF8: UTF-8 mode, case folded through Poco.
struct HasSubsequenceCaseInsensitiveUTF8
{
    static constexpr bool is_utf8 = true;

    /// Maps a code point to the value returned by Poco::Unicode::toLower.
    static int toLowerIfNeed(int code_point)
    {
        const auto lowered = Poco::Unicode::toLower(code_point);
        return lowered;
    }
};
/// Holds the function name string.
struct NameHasSubsequenceCaseInsensitiveUTF8
{
    static constexpr const char * name = "hasSubsequenceCaseInsensitiveUTF8";
};
/// Concrete function type: the shared implementation bound to this file's name and matching policy.
using FunctionHasSubsequenceCaseInsensitiveUTF8
    = HasSubsequenceImpl<NameHasSubsequenceCaseInsensitiveUTF8, HasSubsequenceCaseInsensitiveUTF8>;
}
/// Registers FunctionHasSubsequenceCaseInsensitiveUTF8 in the function factory.
/// The FunctionFactory::CaseInsensitive flag is passed, so the name is presumably matched
/// regardless of letter case; the empty `{}` first argument is presumably default
/// documentation/metadata - confirm against FunctionFactory::registerFunction.
REGISTER_FUNCTION(hasSubsequenceCaseInsensitiveUTF8)
{
factory.registerFunction<FunctionHasSubsequenceCaseInsensitiveUTF8>({}, FunctionFactory::CaseInsensitive);
}
}

View File

@ -0,0 +1,30 @@
#include <Functions/FunctionFactory.h>
#include <Functions/HasSubsequenceImpl.h>
namespace DB
{
namespace
{
/// Matching policy for hasSubsequenceUTF8: UTF-8 mode, case-sensitive.
struct HasSubsequenceCaseSensitiveUTF8
{
    static constexpr bool is_utf8 = true;

    /// Identity: case-sensitive matching compares the code point unchanged.
    static int toLowerIfNeed(int code_point)
    {
        return code_point;
    }
};
/// Holds the function name string.
struct NameHasSubsequenceUTF8
{
    static constexpr const char * name = "hasSubsequenceUTF8";
};
/// Concrete function type: the shared implementation bound to this file's name and matching policy.
using FunctionHasSubsequenceUTF8
    = HasSubsequenceImpl<NameHasSubsequenceUTF8, HasSubsequenceCaseSensitiveUTF8>;
}
/// Registers FunctionHasSubsequenceUTF8 in the function factory.
/// The FunctionFactory::CaseInsensitive flag is passed, so the name is presumably matched
/// regardless of letter case; the empty `{}` first argument is presumably default
/// documentation/metadata - confirm against FunctionFactory::registerFunction.
REGISTER_FUNCTION(hasSubsequenceUTF8)
{
factory.registerFunction<FunctionHasSubsequenceUTF8>({}, FunctionFactory::CaseInsensitive);
}
}

View File

@ -1,4 +1,3 @@
#include "FunctionsStringSearch.h"
#include "FunctionFactory.h"
#include "like.h"

View File

@ -344,6 +344,10 @@ has
hasAll
hasAny
hasColumnInTable
hasSubsequence
hasSubsequenceCaseInsensitive
hasSubsequenceCaseInsensitiveUTF8
hasSubsequenceUTF8
hasSubstr
hasThreadFuzzer
hex

View File

@ -0,0 +1,64 @@
hasSubsequence
1
1
0
1
1
1
1
1
1
1
0
1
0
0
0
1
1
0
1
1
0
1
1
0
hasSubsequenceCaseInsensitive
0
1
1
0
1
1
hasSubsequenceUTF8
1
1
0
1
0
1
0
1
1
0
1
0
1
0
hasSubsequenceCaseInsensitiveUTF8
0
1
1
1
0
1
0
1
0
Nullable
\N
\N
\N
1
1
1

View File

@ -0,0 +1,68 @@
-- Each query below prints exactly one line; the order of statements must stay in sync with the .reference file.

-- Byte-wise, case-sensitive matching: constant and materialized (non-constant) argument combinations.
select 'hasSubsequence';
select hasSubsequence('garbage', '');
select hasSubsequence('garbage', 'g');
select hasSubsequence('garbage', 'G');
select hasSubsequence('garbage', 'a');
select hasSubsequence('garbage', 'e');
select hasSubsequence('garbage', 'gr');
select hasSubsequence('garbage', 'ab');
select hasSubsequence('garbage', 'be');
select hasSubsequence('garbage', 'arg');
select hasSubsequence('garbage', 'gra');
select hasSubsequence('garbage', 'rga');
select hasSubsequence('garbage', 'garbage');
select hasSubsequence('garbage', 'garbage1');
select hasSubsequence('garbage', 'arbw');
select hasSubsequence('garbage', 'ARG');
select hasSubsequence('garbage', materialize(''));
select hasSubsequence('garbage', materialize('arg'));
select hasSubsequence('garbage', materialize('arbw'));
select hasSubsequence(materialize('garbage'), '');
select hasSubsequence(materialize('garbage'), 'arg');
select hasSubsequence(materialize('garbage'), 'arbw');
select hasSubsequence(materialize('garbage'), materialize(''));
select hasSubsequence(materialize('garbage'), materialize('arg'));
select hasSubsequence(materialize('garbage'), materialize('garbage1'));

-- Byte-wise, case-insensitive matching.
select 'hasSubsequenceCaseInsensitive';
select hasSubsequenceCaseInsensitive('garbage', 'w');
select hasSubsequenceCaseInsensitive('garbage', 'ARG');
select hasSubsequenceCaseInsensitive('GARGAGE', 'arg');
select hasSubsequenceCaseInsensitive(materialize('garbage'), materialize('w'));
select hasSubsequenceCaseInsensitive(materialize('garbage'), materialize('ARG'));
select hasSubsequenceCaseInsensitive(materialize('GARGAGE'), materialize('arg'));

-- Code-point-wise, case-sensitive matching. These queries must call hasSubsequenceUTF8
-- (not hasSubsequence), otherwise the UTF-8 variant is never exercised by this test.
select 'hasSubsequenceUTF8';
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', '');
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', 'C'); -- Latin
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', 'С'); -- Cyrillic
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', 'House');
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', 'house');
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', 'система');
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', 'Система');
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', 'ссубд');
select hasSubsequenceUTF8(materialize('ClickHouse - столбцовая система управления базами данных'), 'субд');
select hasSubsequenceUTF8(materialize('ClickHouse - столбцовая система управления базами данных'), 'суббд');
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', materialize('стул'));
select hasSubsequenceUTF8('ClickHouse - столбцовая система управления базами данных', materialize('два стула'));
select hasSubsequenceUTF8(materialize('ClickHouse - столбцовая система управления базами данных'), materialize('орех'));
select hasSubsequenceUTF8(materialize('ClickHouse - столбцовая система управления базами данных'), materialize('два ореха'));

-- Code-point-wise, case-insensitive matching.
select 'hasSubsequenceCaseInsensitiveUTF8';
select hasSubsequenceCaseInsensitiveUTF8('для онлайн обработки аналитических запросов (OLAP)', 'oltp');
select hasSubsequenceCaseInsensitiveUTF8('для онлайн обработки аналитических запросов (OLAP)', 'оОоОоO');
select hasSubsequenceCaseInsensitiveUTF8('для онлайн обработки аналитических запросов (OLAP)', 'я раб');
select hasSubsequenceCaseInsensitiveUTF8(materialize('для онлайн обработки аналитических запросов (OLAP)'), 'работа');
select hasSubsequenceCaseInsensitiveUTF8(materialize('для онлайн обработки аналитических запросов (OLAP)'), 'work');
select hasSubsequenceCaseInsensitiveUTF8('для онлайн обработки аналитических запросов (OLAP)', materialize('добро)'));
select hasSubsequenceCaseInsensitiveUTF8('для онлайн обработки аналитических запросов (OLAP)', materialize('зло()'));
select hasSubsequenceCaseInsensitiveUTF8(materialize('для онлайн обработки аналитических запросов (OLAP)'), materialize('аналитика'));
select hasSubsequenceCaseInsensitiveUTF8(materialize('для онлайн обработки аналитических запросов (OLAP)'), materialize('аналитика для аналитиков'));

-- NULL and Nullable(String) arguments.
select 'Nullable';
select hasSubsequence(Null, Null);
select hasSubsequence(Null, 'a');
select hasSubsequence(Null::Nullable(String), 'arg'::Nullable(String));
select hasSubsequence('garbage'::Nullable(String), 'a');
select hasSubsequence('garbage'::Nullable(String), 'arg'::Nullable(String));
select hasSubsequence(materialize('garbage'::Nullable(String)), materialize('arg'::Nullable(String)));

View File

@ -1539,6 +1539,10 @@ hadoop
halfMD
halfday
hardlinks
hasSubsequence
hasSubsequenceCaseInsensitive
hasSubsequenceCaseInsensitiveUTF
hasSubsequenceUTF
hasAll
hasAny
hasColumnInTable
@ -2247,6 +2251,7 @@ subquery
subranges
subreddits
subseconds
subsequence
substring
substringUTF
substrings