2017-04-21 17:47:27 +00:00
|
|
|
#pragma once
|
2011-10-16 07:11:36 +00:00
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Columns/ColumnConst.h>
|
|
|
|
#include <Columns/ColumnString.h>
|
2020-03-26 18:55:41 +00:00
|
|
|
#include <Columns/ColumnFixedString.h>
|
2018-09-03 04:57:01 +00:00
|
|
|
#include <Columns/ColumnVector.h>
|
2019-01-14 15:54:47 +00:00
|
|
|
#include <DataTypes/DataTypeArray.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <DataTypes/DataTypeString.h>
|
2018-09-03 04:57:01 +00:00
|
|
|
#include <DataTypes/DataTypesNumber.h>
|
2017-07-21 06:35:58 +00:00
|
|
|
#include <Functions/FunctionHelpers.h>
|
2021-05-17 07:30:42 +00:00
|
|
|
#include <Functions/IFunction.h>
|
2019-03-26 21:56:46 +00:00
|
|
|
#include <Interpreters/Context.h>
|
2020-08-04 07:05:16 +00:00
|
|
|
#include <IO/WriteHelpers.h>
|
2016-10-24 13:47:15 +00:00
|
|
|
|
2011-10-16 07:11:36 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2017-05-27 15:45:25 +00:00
|
|
|
/** Search and replace functions in strings:
|
|
|
|
* position(haystack, needle) - the normal search for a substring in a string, returns the position (in bytes) of the found substring starting with 1, or 0 if no substring is found.
|
|
|
|
* positionUTF8(haystack, needle) - the same, but the position is counted in code points, provided that the string is encoded in UTF-8.
|
2016-01-27 03:11:28 +00:00
|
|
|
* positionCaseInsensitive(haystack, needle)
|
|
|
|
* positionCaseInsensitiveUTF8(haystack, needle)
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2017-05-27 15:45:25 +00:00
|
|
|
* like(haystack, pattern) - search by the LIKE pattern; returns 0 or 1. Case-sensitive (see ilike for the case-insensitive variant).
|
2011-10-17 08:28:39 +00:00
|
|
|
* notLike(haystack, pattern)
|
|
|
|
*
|
2022-05-13 08:52:25 +00:00
|
|
|
* ilike(haystack, pattern) - like 'like' but case-insensitive
|
|
|
|
* notIlike(haystack, pattern)
|
|
|
|
*
|
2017-05-27 15:45:25 +00:00
|
|
|
* match(haystack, pattern) - search by regular expression re2; Returns 0 or 1.
|
2012-07-21 03:45:48 +00:00
|
|
|
*
|
2020-12-24 10:11:07 +00:00
|
|
|
* countSubstrings(haystack, needle) -- count number of occurrences of needle in haystack.
|
2020-11-26 18:16:07 +00:00
|
|
|
* countSubstringsCaseInsensitive(haystack, needle)
|
2022-05-13 08:52:25 +00:00
|
|
|
* countSubstringsCaseInsensitiveUTF8(haystack, needle)
|
|
|
|
*
|
|
|
|
* hasToken()
|
|
|
|
* hasTokenCaseInsensitive()
|
|
|
|
*
|
|
|
|
* JSON stuff:
|
|
|
|
* visitParamExtractBool()
|
|
|
|
* simpleJSONExtractBool()
|
|
|
|
* visitParamExtractFloat()
|
|
|
|
* simpleJSONExtractFloat()
|
|
|
|
* visitParamExtractInt()
|
|
|
|
* simpleJSONExtractInt()
|
|
|
|
* visitParamExtractUInt()
|
|
|
|
* simpleJSONExtractUInt()
|
|
|
|
* visitParamHas()
|
|
|
|
* simpleJSONHas()
|
2020-11-26 18:16:07 +00:00
|
|
|
*
|
2017-05-27 15:45:25 +00:00
|
|
|
* Applies regexp re2 and pulls:
|
|
|
|
* - the first subpattern, if the regexp has a subpattern;
|
|
|
|
* - otherwise, the zero subpattern (the whole matched part);
|
|
|
|
* - if not match - an empty string.
|
2013-03-18 10:49:31 +00:00
|
|
|
* extract(haystack, pattern)
|
2011-10-16 07:11:36 +00:00
|
|
|
*/
|
|
|
|
|
2018-09-03 04:57:01 +00:00
|
|
|
/// Error codes thrown by FunctionsStringSearch.
/// They are only declared here (`extern`); this file does not define them.
namespace ErrorCodes
{
    /// Thrown by executeImpl() when the haystack/needle column combination is not handled.
    extern const int ILLEGAL_COLUMN;
    /// Thrown by getReturnTypeImpl() when an argument has an unsupported data type.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    /// Thrown by getReturnTypeImpl() when the number of arguments is neither 2 nor 3.
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
|
2011-10-16 07:11:36 +00:00
|
|
|
|
2021-09-21 16:43:46 +00:00
|
|
|
template <typename Impl>
|
2017-03-10 17:52:36 +00:00
|
|
|
class FunctionsStringSearch : public IFunction
|
2014-01-27 13:49:06 +00:00
|
|
|
{
|
|
|
|
public:
|
2021-09-21 16:43:46 +00:00
|
|
|
static constexpr auto name = Impl::name;
|
2021-06-01 12:20:52 +00:00
|
|
|
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionsStringSearch>(); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2019-01-14 15:54:47 +00:00
|
|
|
String getName() const override { return name; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-08-02 14:24:39 +00:00
|
|
|
bool isVariadic() const override { return Impl::supports_start_pos; }
|
2020-08-01 21:14:23 +00:00
|
|
|
|
2021-06-22 16:21:23 +00:00
|
|
|
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
2021-04-29 14:48:26 +00:00
|
|
|
|
2020-08-02 14:24:39 +00:00
|
|
|
size_t getNumberOfArguments() const override
|
|
|
|
{
|
|
|
|
if (Impl::supports_start_pos)
|
2020-08-01 21:14:23 +00:00
|
|
|
return 0;
|
|
|
|
return 2;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-02-17 18:53:59 +00:00
|
|
|
bool useDefaultImplementationForConstants() const override { return Impl::use_default_implementation_for_constants; }
|
|
|
|
|
|
|
|
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override
|
|
|
|
{
|
2022-05-16 20:23:51 +00:00
|
|
|
return Impl::getArgumentsThatAreAlwaysConstant();
|
2020-02-17 18:53:59 +00:00
|
|
|
}
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
|
|
|
{
|
2020-08-04 07:05:16 +00:00
|
|
|
if (arguments.size() < 2 || 3 < arguments.size())
|
2021-09-21 16:43:46 +00:00
|
|
|
throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
|
2020-08-04 07:05:16 +00:00
|
|
|
+ toString(arguments.size()) + ", should be 2 or 3.",
|
|
|
|
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
|
|
|
|
2020-03-26 19:12:34 +00:00
|
|
|
if (!isStringOrFixedString(arguments[0]))
|
2017-04-01 07:20:54 +00:00
|
|
|
throw Exception(
|
|
|
|
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[1]))
|
2017-04-01 07:20:54 +00:00
|
|
|
throw Exception(
|
|
|
|
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
|
2020-08-02 14:24:39 +00:00
|
|
|
if (arguments.size() >= 3)
|
|
|
|
{
|
|
|
|
if (!isUnsignedInteger(arguments[2]))
|
2020-08-01 21:14:23 +00:00
|
|
|
throw Exception(
|
|
|
|
"Illegal type " + arguments[2]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
|
|
|
|
}
|
|
|
|
|
2020-11-17 13:24:45 +00:00
|
|
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2020-10-18 19:00:13 +00:00
|
|
|
const ColumnPtr & column_haystack = arguments[0].column;
|
|
|
|
const ColumnPtr & column_needle = arguments[1].column;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-08-01 21:14:23 +00:00
|
|
|
ColumnPtr column_start_pos = nullptr;
|
2020-08-02 14:24:39 +00:00
|
|
|
if (arguments.size() >= 3)
|
2020-10-18 19:00:13 +00:00
|
|
|
column_start_pos = arguments[2].column;
|
2020-08-01 21:14:23 +00:00
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
const ColumnConst * col_haystack_const = typeid_cast<const ColumnConst *>(&*column_haystack);
|
|
|
|
const ColumnConst * col_needle_const = typeid_cast<const ColumnConst *>(&*column_needle);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
using ResultType = typename Impl::ResultType;
|
|
|
|
|
2020-02-17 18:53:59 +00:00
|
|
|
if constexpr (!Impl::use_default_implementation_for_constants)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2020-08-01 21:14:23 +00:00
|
|
|
bool is_col_start_pos_const = column_start_pos == nullptr || isColumnConst(*column_start_pos);
|
2020-02-17 18:53:59 +00:00
|
|
|
if (col_haystack_const && col_needle_const)
|
|
|
|
{
|
2020-08-01 21:14:23 +00:00
|
|
|
auto col_res = ColumnVector<ResultType>::create();
|
|
|
|
typename ColumnVector<ResultType>::Container & vec_res = col_res->getData();
|
|
|
|
vec_res.resize(is_col_start_pos_const ? 1 : column_start_pos->size());
|
|
|
|
|
|
|
|
Impl::constantConstant(
|
|
|
|
col_haystack_const->getValue<String>(),
|
|
|
|
col_needle_const->getValue<String>(),
|
|
|
|
column_start_pos,
|
|
|
|
vec_res);
|
|
|
|
|
2020-08-02 14:24:39 +00:00
|
|
|
if (is_col_start_pos_const)
|
2020-10-18 19:00:13 +00:00
|
|
|
return result_type->createColumnConst(col_haystack_const->size(), toField(vec_res[0]));
|
2020-08-02 14:24:39 +00:00
|
|
|
else
|
2020-10-18 19:00:13 +00:00
|
|
|
return col_res;
|
2020-02-17 18:53:59 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2017-12-14 01:43:19 +00:00
|
|
|
auto col_res = ColumnVector<ResultType>::create();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-12-15 21:32:25 +00:00
|
|
|
typename ColumnVector<ResultType>::Container & vec_res = col_res->getData();
|
2017-04-01 07:20:54 +00:00
|
|
|
vec_res.resize(column_haystack->size());
|
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
|
2020-03-26 18:55:41 +00:00
|
|
|
const ColumnFixedString * col_haystack_vector_fixed = checkAndGetColumn<ColumnFixedString>(&*column_haystack);
|
2017-07-21 06:35:58 +00:00
|
|
|
const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (col_haystack_vector && col_needle_vector)
|
2020-03-23 02:12:31 +00:00
|
|
|
Impl::vectorVector(
|
2019-01-14 15:54:47 +00:00
|
|
|
col_haystack_vector->getChars(),
|
2017-04-01 07:20:54 +00:00
|
|
|
col_haystack_vector->getOffsets(),
|
|
|
|
col_needle_vector->getChars(),
|
|
|
|
col_needle_vector->getOffsets(),
|
2020-08-01 21:14:23 +00:00
|
|
|
column_start_pos,
|
2017-04-01 07:20:54 +00:00
|
|
|
vec_res);
|
|
|
|
else if (col_haystack_vector && col_needle_const)
|
2020-03-23 02:12:31 +00:00
|
|
|
Impl::vectorConstant(
|
2020-08-01 21:14:23 +00:00
|
|
|
col_haystack_vector->getChars(),
|
|
|
|
col_haystack_vector->getOffsets(),
|
|
|
|
col_needle_const->getValue<String>(),
|
|
|
|
column_start_pos,
|
|
|
|
vec_res);
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
else if (col_haystack_vector_fixed && col_needle_vector)
|
|
|
|
Impl::vectorFixedVector(
|
|
|
|
col_haystack_vector_fixed->getChars(),
|
|
|
|
col_haystack_vector_fixed->getN(),
|
|
|
|
col_needle_vector->getChars(),
|
|
|
|
col_needle_vector->getOffsets(),
|
|
|
|
column_start_pos,
|
|
|
|
vec_res);
|
2020-03-26 18:55:41 +00:00
|
|
|
else if (col_haystack_vector_fixed && col_needle_const)
|
|
|
|
Impl::vectorFixedConstant(
|
2020-08-01 21:14:23 +00:00
|
|
|
col_haystack_vector_fixed->getChars(),
|
|
|
|
col_haystack_vector_fixed->getN(),
|
|
|
|
col_needle_const->getValue<String>(),
|
|
|
|
vec_res);
|
2017-04-01 07:20:54 +00:00
|
|
|
else if (col_haystack_const && col_needle_vector)
|
2020-03-23 02:12:31 +00:00
|
|
|
Impl::constantVector(
|
2020-08-01 21:14:23 +00:00
|
|
|
col_haystack_const->getValue<String>(),
|
|
|
|
col_needle_vector->getChars(),
|
|
|
|
col_needle_vector->getOffsets(),
|
|
|
|
column_start_pos,
|
|
|
|
vec_res);
|
2017-04-01 07:20:54 +00:00
|
|
|
else
|
2019-01-14 15:54:47 +00:00
|
|
|
throw Exception(
|
2020-10-18 19:00:13 +00:00
|
|
|
"Illegal columns " + arguments[0].column->getName() + " and "
|
|
|
|
+ arguments[1].column->getName() + " of arguments of function " + getName(),
|
2017-04-01 07:20:54 +00:00
|
|
|
ErrorCodes::ILLEGAL_COLUMN);
|
2017-12-16 05:21:04 +00:00
|
|
|
|
2020-10-18 19:00:13 +00:00
|
|
|
return col_res;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
2011-10-16 07:11:36 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|