ClickHouse/src/Functions/HasSubsequenceImpl.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

157 lines
5.9 KiB
C++
Raw Normal View History

2023-07-06 13:16:31 +00:00
#pragma once
2023-07-07 13:15:26 +00:00
#include <Columns/ColumnString.h>
#include <Functions/GatherUtils/Sources.h>
#include <Functions/GatherUtils/Sinks.h>
2023-07-10 09:18:09 +00:00
2023-07-06 13:16:31 +00:00
namespace DB
{
2023-07-10 09:18:09 +00:00
/// Error codes referenced by this header. They are only declared here (`extern`);
/// the definitions are expected to live elsewhere in the project.
namespace ErrorCodes
{
/// Thrown from getReturnTypeImpl when either argument fails the isString check.
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
/// Thrown from executeImpl when the argument columns are not a supported
/// combination of ColumnString / constant ColumnString.
extern const int ILLEGAL_COLUMN;
}
2023-07-06 13:16:31 +00:00
namespace
{
2023-07-07 13:15:26 +00:00
using namespace GatherUtils;
2023-07-06 13:16:31 +00:00
template <typename Name, typename Impl>
2023-07-10 09:18:09 +00:00
class HasSubsequenceImpl : public IFunction
2023-07-06 13:16:31 +00:00
{
2023-07-07 13:15:26 +00:00
public:
2023-07-06 13:16:31 +00:00
static constexpr auto name = Name::name;
2023-07-10 09:18:09 +00:00
static FunctionPtr create(ContextPtr) { return std::make_shared<HasSubsequenceImpl>(); }
2023-07-06 13:16:31 +00:00
2023-07-07 13:15:26 +00:00
String getName() const override { return name; }
2023-07-06 19:43:37 +00:00
2023-07-07 13:15:26 +00:00
bool isVariadic() const override { return false; }
2023-07-06 13:16:31 +00:00
2023-07-07 13:15:26 +00:00
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
2023-07-06 13:16:31 +00:00
2023-07-07 13:15:26 +00:00
size_t getNumberOfArguments() const override { return 2; }
2023-07-06 13:16:31 +00:00
2023-07-07 13:15:26 +00:00
bool useDefaultImplementationForConstants() const override { return false; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {};}
2023-07-06 13:16:31 +00:00
2023-07-07 13:15:26 +00:00
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
2023-07-06 13:16:31 +00:00
{
2023-07-07 13:15:26 +00:00
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of function {}",
arguments[0]->getName(), getName());
2023-07-06 19:43:37 +00:00
2023-07-07 13:15:26 +00:00
if (!isString(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of function {}",
arguments[1]->getName(), getName());
2023-07-06 13:16:31 +00:00
2023-07-07 13:15:26 +00:00
return std::make_shared<DataTypeNumber<UInt8>>();
2023-07-06 13:16:31 +00:00
}
2023-07-07 13:15:26 +00:00
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
2023-07-06 13:16:31 +00:00
{
2023-07-07 13:15:26 +00:00
const ColumnPtr & column_haystack = arguments[0].column;
const ColumnPtr & column_needle = arguments[1].column;
const ColumnConst * haystack_const_string = checkAndGetColumnConst<ColumnString>(column_haystack.get());
const ColumnConst * needle_const_string = checkAndGetColumnConst<ColumnString>(column_needle.get());
const ColumnString * haystack_string = checkAndGetColumn<ColumnString>(&*column_haystack);
const ColumnString * needle_string = checkAndGetColumn<ColumnString>(&*column_needle);
auto col_res = ColumnVector<UInt8>::create();
typename ColumnVector<UInt8>::Container & vec_res = col_res->getData();
vec_res.resize(input_rows_count);
if (haystack_string && needle_string)
execute(StringSource{*haystack_string}, StringSource{*needle_string}, vec_res);
else if (haystack_string && needle_const_string)
execute(StringSource{*haystack_string}, ConstSource<StringSource>{*needle_const_string}, vec_res);
else if (haystack_const_string && needle_string)
execute(ConstSource<StringSource>{*haystack_const_string}, StringSource{*needle_string}, vec_res);
else if (haystack_const_string && needle_const_string)
execute(ConstSource<StringSource>{*haystack_const_string}, ConstSource<StringSource>{*needle_const_string}, vec_res);
else
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
2023-07-10 09:18:09 +00:00
"Illegal columns {} and {} of arguments of function {}",
2023-07-07 13:15:26 +00:00
arguments[0].column->getName(),
2023-07-10 09:18:09 +00:00
arguments[1].column->getName(),
2023-07-07 13:15:26 +00:00
getName());
return col_res;
}
2023-07-06 13:16:31 +00:00
2023-07-07 13:15:26 +00:00
private:
2023-07-06 13:16:31 +00:00
2023-07-07 13:15:26 +00:00
template <typename SourceHaystack, typename SourceNeedle>
void execute(
SourceHaystack && haystacks,
SourceNeedle && needles,
PaddedPODArray<UInt8> & res_data) const
{
while (!haystacks.isEnd())
2023-07-06 13:16:31 +00:00
{
2023-07-10 09:18:09 +00:00
auto haystack_slice = haystacks.getWhole();
auto needle_slice = needles.getWhole();
size_t row_num = haystacks.rowNum();
2023-07-07 13:15:26 +00:00
2023-07-10 07:29:31 +00:00
if constexpr (!Impl::is_utf8)
2023-07-10 09:18:09 +00:00
res_data[row_num] = hasSubsequence(haystack_slice.data, haystack_slice.size, needle_slice.data, needle_slice.size);
2023-07-10 07:29:31 +00:00
else
2023-07-10 09:18:09 +00:00
res_data[row_num] = hasSubsequenceUTF8(haystack_slice.data, haystack_slice.size, needle_slice.data, needle_slice.size);
2023-07-07 13:15:26 +00:00
haystacks.next();
needles.next();
2023-07-06 13:16:31 +00:00
}
}
2023-07-06 19:43:37 +00:00
2023-07-10 09:18:09 +00:00
static UInt8 hasSubsequence(const UInt8 * haystack, size_t haystack_size, const UInt8 * needle, size_t needle_size)
2023-07-06 19:43:37 +00:00
{
size_t j = 0;
for (size_t i = 0; (i < haystack_size) && (j < needle_size); i++)
2023-07-10 09:18:09 +00:00
if (Impl::toLowerIfNeed(needle[j]) == Impl::toLowerIfNeed(haystack[i]))
2023-07-06 19:43:37 +00:00
++j;
return j == needle_size;
}
2023-07-10 07:29:31 +00:00
2023-07-10 09:18:09 +00:00
static UInt8 hasSubsequenceUTF8(const UInt8 * haystack, size_t haystack_size, const UInt8 * needle, size_t needle_size)
2023-07-10 07:29:31 +00:00
{
const auto * haystack_pos = haystack;
const auto * needle_pos = needle;
const auto * haystack_end = haystack + haystack_size;
const auto * needle_end = needle + needle_size;
if (!needle_size)
return 1;
auto haystack_code_point = UTF8::convertUTF8ToCodePoint(haystack_pos, haystack_end - haystack_pos);
auto needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
if (!haystack_code_point || !needle_code_point)
return 0;
2023-07-10 09:18:09 +00:00
while (haystack_code_point && needle_code_point)
2023-07-10 09:33:53 +00:00
{
2023-07-10 09:18:09 +00:00
if (Impl::toLowerIfNeed(*needle_code_point) == Impl::toLowerIfNeed(*haystack_code_point))
2023-07-10 07:29:31 +00:00
{
needle_pos += UTF8::seqLength(*needle_pos);
2023-07-10 09:18:09 +00:00
if (needle_pos >= needle_end)
2023-07-10 07:29:31 +00:00
break;
needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
}
haystack_pos += UTF8::seqLength(*haystack_pos);
2023-07-10 09:18:09 +00:00
if (haystack_pos >= haystack_end)
2023-07-10 07:29:31 +00:00
break;
haystack_code_point = UTF8::convertUTF8ToCodePoint(haystack_pos, haystack_end - haystack_pos);
}
return needle_pos == needle_end;
}
2023-07-06 13:16:31 +00:00
};
}
}