mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 01:00:48 +00:00
Merge pull request #29981 from kitaisreal/added-function-tokens
Added function tokens
This commit is contained in:
commit
4ed28d8e52
@ -28,7 +28,7 @@ The function also works for [arrays](array-functions.md#function-empty) or [UUID
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Returns `1` for an empty string or `0` for a non-empty string.
|
||||
- Returns `1` for an empty string or `0` for a non-empty string.
|
||||
|
||||
Type: [UInt8](../data-types/int-uint.md).
|
||||
|
||||
@ -68,7 +68,7 @@ The function also works for [arrays](array-functions.md#function-notempty) or [U
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Returns `1` for a non-empty string or `0` for an empty string.
|
||||
- Returns `1` for a non-empty string or `0` for an empty string.
|
||||
|
||||
Type: [UInt8](../data-types/int-uint.md).
|
||||
|
||||
@ -313,6 +313,32 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b');
|
||||
└───────────────────────┘
|
||||
```
|
||||
|
||||
## tokens {#tokens}
|
||||
|
||||
Splits a string into tokens using non-alphanumeric ASCII characters as separators.
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `input_string` — Any set of bytes represented as the [String](../../sql-reference/data-types/string.md) data type object.
|
||||
|
||||
**Returned value**
|
||||
|
||||
- The resulting array of tokens from the input string.
|
||||
|
||||
Type: [Array](../data-types/array.md).
|
||||
|
||||
**Example**
|
||||
|
||||
``` sql
|
||||
SELECT tokens('test1,;\\ test2,;\\ test3,;\\ test4') AS tokens;
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─tokens────────────────────────────┐
|
||||
│ ['test1','test2','test3','test4'] │
|
||||
└───────────────────────────────────┘
|
||||
```
|
||||
|
||||
## repeat {#repeat}
|
||||
|
||||
Repeats a string as many times as specified and concatenates the replicated values as a single string.
|
||||
|
@ -1,126 +0,0 @@
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeFixedString.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Interpreters/ITokenExtractor.h>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
class FunctionNgrams : public IFunction
|
||||
{
|
||||
public:
|
||||
|
||||
static constexpr auto name = "ngrams";
|
||||
|
||||
static FunctionPtr create(ContextPtr)
|
||||
{
|
||||
return std::make_shared<FunctionNgrams>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 2; }
|
||||
bool isVariadic() const override { return false; }
|
||||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
|
||||
|
||||
bool useDefaultImplementationForNulls() const override { return true; }
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
bool useDefaultImplementationForLowCardinalityColumns() const override { return true; }
|
||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
auto ngram_input_argument_type = WhichDataType(arguments[0].type);
|
||||
if (!ngram_input_argument_type.isStringOrFixedString())
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Function {} second argument type should be String or FixedString. Actual {}",
|
||||
getName(),
|
||||
arguments[0].type->getName());
|
||||
|
||||
const auto & column_with_type = arguments[1];
|
||||
const auto & ngram_argument_column = arguments[1].column;
|
||||
auto ngram_argument_type = WhichDataType(column_with_type.type);
|
||||
|
||||
if (!ngram_argument_type.isNativeUInt() || !ngram_argument_column || !isColumnConst(*ngram_argument_column))
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Function {} second argument type should be constant UInt. Actual {}",
|
||||
getName(),
|
||||
arguments[1].type->getName());
|
||||
|
||||
Field ngram_argument_value;
|
||||
ngram_argument_column->get(0, ngram_argument_value);
|
||||
auto ngram_value = ngram_argument_value.safeGet<UInt64>();
|
||||
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeFixedString>(ngram_value));
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
|
||||
{
|
||||
Field ngram_argument_value;
|
||||
arguments[1].column->get(0, ngram_argument_value);
|
||||
auto ngram_value = ngram_argument_value.safeGet<UInt64>();
|
||||
|
||||
NgramTokenExtractor extractor(ngram_value);
|
||||
|
||||
auto result_column_fixed_string = ColumnFixedString::create(ngram_value);
|
||||
auto column_offsets = ColumnArray::ColumnOffsets::create();
|
||||
|
||||
auto input_column = arguments[0].column;
|
||||
if (const auto * column_string = checkAndGetColumn<ColumnString>(input_column.get()))
|
||||
executeImpl(extractor, *column_string, *result_column_fixed_string, *column_offsets);
|
||||
else if (const auto * column_fixed_string = checkAndGetColumn<ColumnFixedString>(input_column.get()))
|
||||
executeImpl(extractor, *column_fixed_string, *result_column_fixed_string, *column_offsets);
|
||||
|
||||
return ColumnArray::create(std::move(result_column_fixed_string), std::move(column_offsets));
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
template <typename StringColumnType>
|
||||
inline void executeImpl(const NgramTokenExtractor & extractor, StringColumnType & input_data_column, ColumnFixedString & result_data_column, ColumnArray::ColumnOffsets & offsets_column) const
|
||||
{
|
||||
size_t current_tokens_size = 0;
|
||||
auto & offsets_data = offsets_column.getData();
|
||||
|
||||
size_t column_size = input_data_column.size();
|
||||
offsets_data.resize(column_size);
|
||||
|
||||
for (size_t i = 0; i < column_size; ++i)
|
||||
{
|
||||
auto data = input_data_column.getDataAt(i);
|
||||
|
||||
size_t cur = 0;
|
||||
size_t token_start = 0;
|
||||
size_t token_length = 0;
|
||||
|
||||
while (cur < data.size && extractor.nextInString(data.data, data.size, &cur, &token_start, &token_length))
|
||||
{
|
||||
result_data_column.insertData(data.data + token_start, token_length);
|
||||
++current_tokens_size;
|
||||
}
|
||||
|
||||
offsets_data[i] = current_tokens_size;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers the `ngrams` function in the given factory.
void registerFunctionNgrams(FunctionFactory & factory)
{
    using Function = FunctionNgrams;
    factory.registerFunction<Function>();
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -37,7 +37,7 @@ void registerFunctionsStringArray(FunctionFactory &);
|
||||
void registerFunctionsStringSearch(FunctionFactory &);
|
||||
void registerFunctionsStringRegexp(FunctionFactory &);
|
||||
void registerFunctionsStringSimilarity(FunctionFactory &);
|
||||
void registerFunctionNgrams(FunctionFactory &);
|
||||
void registerFunctionsStringTokenExtractor(FunctionFactory &);
|
||||
void registerFunctionsURL(FunctionFactory &);
|
||||
void registerFunctionsVisitParam(FunctionFactory &);
|
||||
void registerFunctionsMath(FunctionFactory &);
|
||||
@ -101,7 +101,7 @@ void registerFunctions()
|
||||
registerFunctionsStringSearch(factory);
|
||||
registerFunctionsStringRegexp(factory);
|
||||
registerFunctionsStringSimilarity(factory);
|
||||
registerFunctionNgrams(factory);
|
||||
registerFunctionsStringTokenExtractor(factory);
|
||||
registerFunctionsURL(factory);
|
||||
registerFunctionsVisitParam(factory);
|
||||
registerFunctionsMath(factory);
|
||||
|
166
src/Functions/tokenExtractors.cpp
Normal file
166
src/Functions/tokenExtractors.cpp
Normal file
@ -0,0 +1,166 @@
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeFixedString.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Interpreters/ITokenExtractor.h>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
/// Selects which function FunctionTokenExtractor is instantiated as.
/// Kept as an unscoped enum: the enumerators are also referenced without qualification.
enum TokenExtractorStrategy
{
    ngrams = 0, /// the `ngrams` function
    tokens = 1, /// the `tokens` function
};
|
||||
|
||||
template <TokenExtractorStrategy strategy>
|
||||
class FunctionTokenExtractor : public IFunction
|
||||
{
|
||||
public:
|
||||
|
||||
static constexpr auto name = strategy == ngrams ? "ngrams" : "tokens";
|
||||
|
||||
static FunctionPtr create(ContextPtr)
|
||||
{
|
||||
return std::make_shared<FunctionTokenExtractor>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return strategy == ngrams ? 2 : 1; }
|
||||
bool isVariadic() const override { return false; }
|
||||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return strategy == ngrams ? ColumnNumbers{1} : ColumnNumbers{}; }
|
||||
|
||||
bool useDefaultImplementationForNulls() const override { return true; }
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
bool useDefaultImplementationForLowCardinalityColumns() const override { return true; }
|
||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
auto ngram_input_argument_type = WhichDataType(arguments[0].type);
|
||||
if (!ngram_input_argument_type.isStringOrFixedString())
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Function {} first argument type should be String or FixedString. Actual {}",
|
||||
getName(),
|
||||
arguments[0].type->getName());
|
||||
|
||||
if constexpr (strategy == ngrams)
|
||||
{
|
||||
const auto & column_with_type = arguments[1];
|
||||
const auto & ngram_argument_column = arguments[1].column;
|
||||
auto ngram_argument_type = WhichDataType(column_with_type.type);
|
||||
|
||||
if (!ngram_argument_type.isNativeUInt() || !ngram_argument_column || !isColumnConst(*ngram_argument_column))
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Function {} second argument type should be constant UInt. Actual {}",
|
||||
getName(),
|
||||
arguments[1].type->getName());
|
||||
|
||||
Field ngram_argument_value;
|
||||
ngram_argument_column->get(0, ngram_argument_value);
|
||||
auto ngram_value = ngram_argument_value.safeGet<UInt64>();
|
||||
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeFixedString>(ngram_value));
|
||||
}
|
||||
else
|
||||
{
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
|
||||
}
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
|
||||
{
|
||||
auto column_offsets = ColumnArray::ColumnOffsets::create();
|
||||
|
||||
if constexpr (strategy == TokenExtractorStrategy::ngrams)
|
||||
{
|
||||
Field ngram_argument_value;
|
||||
arguments[1].column->get(0, ngram_argument_value);
|
||||
auto ngram_value = ngram_argument_value.safeGet<UInt64>();
|
||||
|
||||
NgramTokenExtractor extractor(ngram_value);
|
||||
|
||||
auto result_column_fixed_string = ColumnFixedString::create(ngram_value);
|
||||
|
||||
auto input_column = arguments[0].column;
|
||||
|
||||
if (const auto * column_string = checkAndGetColumn<ColumnString>(input_column.get()))
|
||||
executeImpl(extractor, *column_string, *result_column_fixed_string, *column_offsets);
|
||||
else if (const auto * column_fixed_string = checkAndGetColumn<ColumnFixedString>(input_column.get()))
|
||||
executeImpl(extractor, *column_fixed_string, *result_column_fixed_string, *column_offsets);
|
||||
|
||||
return ColumnArray::create(std::move(result_column_fixed_string), std::move(column_offsets));
|
||||
}
|
||||
else
|
||||
{
|
||||
SplitTokenExtractor extractor;
|
||||
|
||||
auto result_column_string = ColumnString::create();
|
||||
|
||||
auto input_column = arguments[0].column;
|
||||
|
||||
if (const auto * column_string = checkAndGetColumn<ColumnString>(input_column.get()))
|
||||
executeImpl(extractor, *column_string, *result_column_string, *column_offsets);
|
||||
else if (const auto * column_fixed_string = checkAndGetColumn<ColumnFixedString>(input_column.get()))
|
||||
executeImpl(extractor, *column_fixed_string, *result_column_string, *column_offsets);
|
||||
|
||||
return ColumnArray::create(std::move(result_column_string), std::move(column_offsets));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
template <typename ExtractorType, typename StringColumnType, typename ResultStringColumnType>
|
||||
inline void executeImpl(
|
||||
const ExtractorType & extractor,
|
||||
StringColumnType & input_data_column,
|
||||
ResultStringColumnType & result_data_column,
|
||||
ColumnArray::ColumnOffsets & offsets_column) const
|
||||
{
|
||||
size_t current_tokens_size = 0;
|
||||
auto & offsets_data = offsets_column.getData();
|
||||
|
||||
size_t column_size = input_data_column.size();
|
||||
offsets_data.resize(column_size);
|
||||
|
||||
for (size_t i = 0; i < column_size; ++i)
|
||||
{
|
||||
auto data = input_data_column.getDataAt(i);
|
||||
|
||||
size_t cur = 0;
|
||||
size_t token_start = 0;
|
||||
size_t token_length = 0;
|
||||
|
||||
while (cur < data.size && extractor.nextInString(data.data, data.size, &cur, &token_start, &token_length))
|
||||
{
|
||||
result_data_column.insertData(data.data + token_start, token_length);
|
||||
++current_tokens_size;
|
||||
}
|
||||
|
||||
offsets_data[i] = current_tokens_size;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers both instantiations of FunctionTokenExtractor: first `ngrams`, then `tokens`.
void registerFunctionsStringTokenExtractor(FunctionFactory & factory)
{
    using FunctionNgramsExtractor = FunctionTokenExtractor<TokenExtractorStrategy::ngrams>;
    using FunctionTokensExtractor = FunctionTokenExtractor<TokenExtractorStrategy::tokens>;

    factory.registerFunction<FunctionNgramsExtractor>();
    factory.registerFunction<FunctionTokensExtractor>();
}
|
||||
|
||||
}
|
||||
|
||||
|
16
tests/queries/0_stateless/02028_tokens.reference
Normal file
16
tests/queries/0_stateless/02028_tokens.reference
Normal file
@ -0,0 +1,16 @@
|
||||
['test']
|
||||
['test1','test2','test3']
|
||||
['test1','test2','test3','test4']
|
||||
['test1','test2','test3','test4']
|
||||
['ё','ё','జ్ఞా','本気ですか','ﷺ','ᾂ','ΐ','שּ']
|
||||
['ё','ё','జ్ఞా','本気ですか','ﷺ','ᾂ','ΐ','שּ']
|
||||
['ё','ё','జ్ఞా','本気ですか','ﷺ','ᾂ','ΐ','שּ']
|
||||
['ё','ё','జ్ఞా','本気ですか','ﷺ','ᾂ','ΐ','שּ']
|
||||
['test']
|
||||
['test1','test2','test3']
|
||||
['test1','test2','test3','test4']
|
||||
['test1','test2','test3','test4']
|
||||
['ё','ё','జ్ఞా','本気ですか','ﷺ','ᾂ','ΐ','שּ']
|
||||
['ё','ё','జ్ఞా','本気ですか','ﷺ','ᾂ','ΐ','שּ']
|
||||
['ё','ё','జ్ఞా','本気ですか','ﷺ','ᾂ','ΐ','שּ']
|
||||
['ё','ё','జ్ఞా','本気ですか','ﷺ','ᾂ','ΐ','שּ']
|
17
tests/queries/0_stateless/02028_tokens.sql
Normal file
17
tests/queries/0_stateless/02028_tokens.sql
Normal file
@ -0,0 +1,17 @@
|
||||
SELECT tokens('test');
|
||||
SELECT tokens('test1, test2, test3');
|
||||
SELECT tokens('test1, test2, test3, test4');
|
||||
SELECT tokens('test1,;\ test2,;\ test3,;\ test4');
|
||||
SELECT tokens('ё ё జ్ఞా 本気ですか ﷺ ᾂ ΐ שּ');
|
||||
SELECT tokens('ё, ё, జ్ఞా, 本気ですか, ﷺ, ᾂ, ΐ, שּ');
|
||||
SELECT tokens('ё, ё, జ్ఞా, 本気ですか, ﷺ, ᾂ, ΐ, שּ');
|
||||
SELECT tokens('ё;\ ё;\ జ్ఞా;\ 本気ですか;\ ﷺ;\ ᾂ;\ ΐ;\ שּ');
|
||||
|
||||
SELECT tokens(materialize('test'));
|
||||
SELECT tokens(materialize('test1, test2, test3'));
|
||||
SELECT tokens(materialize('test1, test2, test3, test4'));
|
||||
SELECT tokens(materialize('test1,;\ test2,;\ test3,;\ test4'));
|
||||
SELECT tokens(materialize('ё ё జ్ఞా 本気ですか ﷺ ᾂ ΐ שּ'));
|
||||
SELECT tokens(materialize('ё, ё, జ్ఞా, 本気ですか, ﷺ, ᾂ, ΐ, שּ'));
|
||||
SELECT tokens(materialize('ё, ё, జ్ఞా, 本気ですか, ﷺ, ᾂ, ΐ, שּ'));
|
||||
SELECT tokens(materialize('ё;\ ё;\ జ్ఞా;\ 本気ですか;\ ﷺ;\ ᾂ;\ ΐ;\ שּ'));
|
Loading…
Reference in New Issue
Block a user