Merge pull request #56375 from rschu1ze/idatatype-convenience-functions

Cleanup convenience functions in IDataType
This commit is contained in:
Robert Schulze 2023-11-07 16:23:45 +01:00 committed by GitHub
commit 4db2e25ca2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 148 additions and 211 deletions

View File

@ -1371,6 +1371,86 @@ Result:
└──────────────────┘
```
## byteHammingDistance
Calculates the [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) between two byte strings.
**Syntax**
```sql
byteHammingDistance(string1, string2)
```
**Examples**
``` sql
SELECT byteHammingDistance('karolin', 'kathrin');
```
Result:
``` text
┌─byteHammingDistance('karolin', 'kathrin')─┐
│ 3 │
└───────────────────────────────────────────┘
```
Alias: mismatches
## stringJaccardIndex
Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings.
**Syntax**
```sql
stringJaccardIndex(string1, string2)
```
**Examples**
``` sql
SELECT stringJaccardIndex('clickhouse', 'mouse');
```
Result:
``` text
┌─stringJaccardIndex('clickhouse', 'mouse')─┐
│ 0.4 │
└───────────────────────────────────────────┘
```
## stringJaccardIndexUTF8
Like [stringJaccardIndex](#stringJaccardIndex) but for UTF-8-encoded strings.
## editDistance
Calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two byte strings.
**Syntax**
```sql
editDistance(string1, string2)
```
**Examples**
``` sql
SELECT editDistance('clickhouse', 'mouse');
```
Result:
``` text
┌─editDistance('clickhouse', 'mouse')─┐
│ 6 │
└─────────────────────────────────────┘
```
Alias: levenshteinDistance
## initcap
Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters.

View File

@ -681,79 +681,3 @@ Like [hasSubsequence](#hasSubsequence) but assumes `haystack` and `needle` are U
## hasSubsequenceCaseInsensitiveUTF8
Like [hasSubsequenceUTF8](#hasSubsequenceUTF8) but searches case-insensitively.
## byteHammingDistance
Calculates the [hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) between two byte strings.
**Syntax**
```sql
byteHammingDistance(string1, string2)
```
**Examples**
``` sql
SELECT byteHammingDistance('karolin', 'kathrin');
```
Result:
``` text
┌─byteHammingDistance('karolin', 'kathrin')─┐
│ 3 │
└───────────────────────────────────────────┘
```
Alias: mismatches
## stringJaccardIndex
Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings.
**Syntax**
```sql
stringJaccardIndex(string1, string2)
```
**Examples**
``` sql
SELECT stringJaccardIndex('clickhouse', 'mouse');
```
Result:
``` text
┌─stringJaccardIndex('clickhouse', 'mouse')─┐
│ 0.4 │
└───────────────────────────────────────────┘
```
## editDistance
Calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two byte strings.
**Syntax**
```sql
editDistance(string1, string2)
```
**Examples**
``` sql
SELECT editDistance('clickhouse', 'mouse');
```
Result:
``` text
┌─editDistance('clickhouse', 'mouse')─┐
│ 6 │
└─────────────────────────────────────┘
```
Alias: levenshteinDistance

View File

@ -1106,7 +1106,7 @@ public:
{
if (isInteger(data_type))
{
if (isUnsignedInteger(data_type))
if (isUInt(data_type))
return std::make_unique<UnsignedIntegerModel>(seed);
else
return std::make_unique<SignedIntegerModel>(seed);

View File

@ -84,7 +84,7 @@ public:
}
}
if (!isUnsignedInteger(arguments[1]))
if (!isUInt(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument of aggregate function {} must be unsigned integer.", getName());
if (default_value.isNull())

View File

@ -238,7 +238,7 @@ public:
if constexpr (has_second_arg)
{
assertBinary(Name::name, types);
if (!isUnsignedInteger(types[1]))
if (!isUInt(types[1]))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Second argument (weight) for function {} must be unsigned integer, but it has type {}",

View File

@ -363,6 +363,9 @@ struct WhichDataType
/// Signed integers of width 8, 16, 32 or 64 bits.
constexpr bool isNativeInt() const { return isInt8() || isInt16() || isInt32() || isInt64(); }
/// Any signed integer: the native widths plus Int128 and Int256.
constexpr bool isInt() const { return isNativeInt() || isInt128() || isInt256(); }
/// Native-width integer of either signedness.
constexpr bool isNativeInteger() const { return isNativeInt() || isNativeUInt(); }
/// Integer of either signedness (union of isInt() and isUInt()).
constexpr bool isInteger() const { return isInt() || isUInt(); }
/// Exact-match checks of the stored type index against one Decimal variant each.
constexpr bool isDecimal32() const { return idx == TypeIndex::Decimal32; }
constexpr bool isDecimal64() const { return idx == TypeIndex::Decimal64; }
constexpr bool isDecimal128() const { return idx == TypeIndex::Decimal128; }
@ -373,6 +376,9 @@ struct WhichDataType
/// Exact-match check of the stored type index against Float64.
constexpr bool isFloat64() const { return idx == TypeIndex::Float64; }
/// Either floating-point type (Float32 or Float64).
constexpr bool isFloat() const { return isFloat32() || isFloat64(); }
/// Native-width integer or floating-point type.
constexpr bool isNativeNumber() const { return isNativeInteger() || isFloat(); }
/// Any integer, floating-point or decimal type.
constexpr bool isNumber() const { return isInteger() || isFloat() || isDecimal(); }
/// Exact-match checks of the stored type index against one Enum variant each.
constexpr bool isEnum8() const { return idx == TypeIndex::Enum8; }
constexpr bool isEnum16() const { return idx == TypeIndex::Enum16; }
/// Either Enum variant.
constexpr bool isEnum() const { return isEnum8() || isEnum16(); }
@ -410,110 +416,60 @@ struct WhichDataType
/// IDataType helpers (alternative for IDataType virtual methods with single point of truth)
template <typename T>
inline bool isDate(const T & data_type) { return WhichDataType(data_type).isDate(); }
template <typename T>
inline bool isDate32(const T & data_type) { return WhichDataType(data_type).isDate32(); }
template <typename T>
inline bool isDateOrDate32(const T & data_type) { return WhichDataType(data_type).isDateOrDate32(); }
template <typename T>
inline bool isDateTime(const T & data_type) { return WhichDataType(data_type).isDateTime(); }
template <typename T>
inline bool isDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTime64(); }
template <typename T>
inline bool isDateTimeOrDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTimeOrDateTime64(); }
template <typename T>
inline bool isDateOrDate32OrDateTimeOrDateTime64(const T & data_type) { return WhichDataType(data_type).isDateOrDate32OrDateTimeOrDateTime64(); }
/// Unsigned-integer helpers. Each one builds a WhichDataType from `data_type`
/// and forwards to the WhichDataType member of the same name.
template <typename T> inline bool isUInt8(const T & data_type) { return WhichDataType(data_type).isUInt8(); }
template <typename T> inline bool isUInt16(const T & data_type) { return WhichDataType(data_type).isUInt16(); }
template <typename T> inline bool isUInt32(const T & data_type) { return WhichDataType(data_type).isUInt32(); }
template <typename T> inline bool isUInt64(const T & data_type) { return WhichDataType(data_type).isUInt64(); }
template <typename T> inline bool isNativeUInt(const T & data_type) { return WhichDataType(data_type).isNativeUInt(); }
template <typename T> inline bool isUInt(const T & data_type) { return WhichDataType(data_type).isUInt(); }
template <typename T>
inline bool isEnum(const T & data_type) { return WhichDataType(data_type).isEnum(); }
template <typename T>
inline bool isDecimal(const T & data_type) { return WhichDataType(data_type).isDecimal(); }
template <typename T>
inline bool isTuple(const T & data_type) { return WhichDataType(data_type).isTuple(); }
template <typename T>
inline bool isArray(const T & data_type) { return WhichDataType(data_type).isArray(); }
template <typename T>
inline bool isMap(const T & data_type) {return WhichDataType(data_type).isMap(); }
template <typename T>
inline bool isInterval(const T & data_type) {return WhichDataType(data_type).isInterval(); }
template <typename T>
inline bool isNothing(const T & data_type) { return WhichDataType(data_type).isNothing(); }
template <typename T>
inline bool isUUID(const T & data_type) { return WhichDataType(data_type).isUUID(); }
template <typename T>
inline bool isIPv4(const T & data_type) { return WhichDataType(data_type).isIPv4(); }
template <typename T>
inline bool isIPv6(const T & data_type) { return WhichDataType(data_type).isIPv6(); }
/// Signed-integer helpers. Each one builds a WhichDataType from `data_type`
/// and forwards to the WhichDataType member of the same name.
template <typename T> inline bool isInt8(const T & data_type) { return WhichDataType(data_type).isInt8(); }
template <typename T> inline bool isInt16(const T & data_type) { return WhichDataType(data_type).isInt16(); }
template <typename T> inline bool isInt32(const T & data_type) { return WhichDataType(data_type).isInt32(); }
template <typename T> inline bool isInt64(const T & data_type) { return WhichDataType(data_type).isInt64(); }
/// Signed integers of width 8 to 64 bits (see WhichDataType::isNativeInt).
template <typename T> inline bool isNativeInt(const T & data_type) { return WhichDataType(data_type).isNativeInt(); }
/// Any signed integer, including Int128 and Int256 (see WhichDataType::isInt).
template <typename T> inline bool isInt(const T & data_type) { return WhichDataType(data_type).isInt(); }
template <typename T>
inline bool isObject(const T & data_type) { return WhichDataType(data_type).isObject(); }
/// Integer of either signedness; forwards to WhichDataType::isInteger().
template <typename T> inline bool isInteger(const T & data_type) { return WhichDataType(data_type).isInteger(); }
/// Native-width integer of either signedness; forwards to WhichDataType::isNativeInteger().
template <typename T> inline bool isNativeInteger(const T & data_type) { return WhichDataType(data_type).isNativeInteger(); }
template <typename T>
inline bool isUInt8(const T & data_type) { return WhichDataType(data_type).isUInt8(); }
template <typename T>
inline bool isUInt16(const T & data_type) { return WhichDataType(data_type).isUInt16(); }
template <typename T>
inline bool isUInt32(const T & data_type) { return WhichDataType(data_type).isUInt32(); }
template <typename T>
inline bool isUInt64(const T & data_type) { return WhichDataType(data_type).isUInt64(); }
template <typename T>
inline bool isNativeUnsignedInteger(const T & data_type) { return WhichDataType(data_type).isNativeUInt(); }
template <typename T>
inline bool isUnsignedInteger(const T & data_type) { return WhichDataType(data_type).isUInt(); }
template <typename T> inline bool isDecimal(const T & data_type) { return WhichDataType(data_type).isDecimal(); }
template <typename T>
inline bool isInt8(const T & data_type) { return WhichDataType(data_type).isInt8(); }
template <typename T>
inline bool isInt16(const T & data_type) { return WhichDataType(data_type).isInt16(); }
template <typename T>
inline bool isInt32(const T & data_type) { return WhichDataType(data_type).isInt32(); }
template <typename T>
inline bool isInt64(const T & data_type) { return WhichDataType(data_type).isInt64(); }
template <typename T>
inline bool isInt(const T & data_type) { return WhichDataType(data_type).isInt(); }
template <typename T> inline bool isFloat(const T & data_type) { return WhichDataType(data_type).isFloat(); }
/// True when WhichDataType reports `data_type` as a signed or an unsigned integer.
template <typename T>
inline bool isInteger(const T & data_type)
{
    /// WhichDataType::isInteger() is defined as isInt() || isUInt().
    return WhichDataType(data_type).isInteger();
}
template <typename T> inline bool isNativeNumber(const T & data_type) { return WhichDataType(data_type).isNativeNumber(); }
template <typename T> inline bool isNumber(const T & data_type) { return WhichDataType(data_type).isNumber(); }
/// True when WhichDataType reports `data_type` as a floating-point type.
template <typename T>
inline bool isFloat(const T & data_type)
{
    return WhichDataType(data_type).isFloat();
}
template <typename T> inline bool isEnum(const T & data_type) { return WhichDataType(data_type).isEnum(); }
/// True when WhichDataType reports `data_type` as a native-width integer, signed or unsigned.
template <typename T>
inline bool isNativeInteger(const T & data_type)
{
    /// WhichDataType::isNativeInteger() is defined as isNativeInt() || isNativeUInt().
    return WhichDataType(data_type).isNativeInteger();
}
/// Date/time helpers. Each one builds a WhichDataType from `data_type`
/// and forwards to the WhichDataType member of the same name.
template <typename T> inline bool isDate(const T & data_type) { return WhichDataType(data_type).isDate(); }
template <typename T> inline bool isDate32(const T & data_type) { return WhichDataType(data_type).isDate32(); }
template <typename T> inline bool isDateOrDate32(const T & data_type) { return WhichDataType(data_type).isDateOrDate32(); }
template <typename T> inline bool isDateTime(const T & data_type) { return WhichDataType(data_type).isDateTime(); }
template <typename T> inline bool isDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTime64(); }
template <typename T> inline bool isDateTimeOrDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTimeOrDateTime64(); }
template <typename T> inline bool isDateOrDate32OrDateTimeOrDateTime64(const T & data_type) { return WhichDataType(data_type).isDateOrDate32OrDateTimeOrDateTime64(); }
/// String helpers, forwarding in the same way.
template <typename T> inline bool isString(const T & data_type) { return WhichDataType(data_type).isString(); }
template <typename T> inline bool isFixedString(const T & data_type) { return WhichDataType(data_type).isFixedString(); }
template <typename T> inline bool isStringOrFixedString(const T & data_type) { return WhichDataType(data_type).isStringOrFixedString(); }
/// True when WhichDataType reports `data_type` as a native-width integer or a floating-point type.
template <typename T>
inline bool isNativeNumber(const T & data_type)
{
    /// WhichDataType::isNativeNumber() is defined as isNativeInteger() || isFloat().
    return WhichDataType(data_type).isNativeNumber();
}
/// Helpers for UUID, IP address and composite types. Each one builds a
/// WhichDataType from `data_type` and forwards to the member of the same name.
template <typename T> inline bool isUUID(const T & data_type) { return WhichDataType(data_type).isUUID(); }
template <typename T> inline bool isIPv4(const T & data_type) { return WhichDataType(data_type).isIPv4(); }
template <typename T> inline bool isIPv6(const T & data_type) { return WhichDataType(data_type).isIPv6(); }
template <typename T> inline bool isArray(const T & data_type) { return WhichDataType(data_type).isArray(); }
template <typename T> inline bool isTuple(const T & data_type) { return WhichDataType(data_type).isTuple(); }
template <typename T> inline bool isMap(const T & data_type) {return WhichDataType(data_type).isMap(); }
template <typename T> inline bool isInterval(const T & data_type) {return WhichDataType(data_type).isInterval(); }
template <typename T> inline bool isObject(const T & data_type) { return WhichDataType(data_type).isObject(); }
/// True when WhichDataType reports `data_type` as an integer, floating-point or decimal type.
template <typename T>
inline bool isNumber(const T & data_type)
{
    /// WhichDataType::isNumber() is defined as isInteger() || isFloat() || isDecimal().
    return WhichDataType(data_type).isNumber();
}
template <typename T> inline bool isNothing(const T & data_type) { return WhichDataType(data_type).isNothing(); }
template <typename T>
inline bool isColumnedAsNumber(const T & data_type)
{
WhichDataType which(data_type);
return which.isInt() || which.isUInt() || which.isFloat() || which.isDateOrDate32() || which.isDateTime() || which.isDateTime64() || which.isUUID() || which.isIPv4() || which.isIPv6();
return which.isInteger() || which.isFloat() || which.isDateOrDate32OrDateTimeOrDateTime64() || which.isUUID() || which.isIPv4() || which.isIPv6();
}
template <typename T>
@ -531,24 +487,6 @@ inline bool isColumnedAsDecimalT(const DataType & data_type)
return (which.isDecimal() || which.isDateTime64()) && which.idx == TypeToTypeIndex<T>;
}
template <typename T>
inline bool isString(const T & data_type)
{
return WhichDataType(data_type).isString();
}
template <typename T>
inline bool isFixedString(const T & data_type)
{
return WhichDataType(data_type).isFixedString();
}
template <typename T>
inline bool isStringOrFixedString(const T & data_type)
{
return WhichDataType(data_type).isStringOrFixedString();
}
template <typename T>
inline bool isNotCreatable(const T & data_type)
{
@ -567,12 +505,6 @@ inline bool isBool(const DataTypePtr & data_type)
return data_type->getName() == "Bool";
}
/// Reports whether WhichDataType classifies `data_type` as an aggregate function type.
inline bool isAggregateFunction(const DataTypePtr & data_type)
{
    return WhichDataType(data_type).isAggregateFunction();
}
inline bool isNullableOrLowCardinalityNullable(const DataTypePtr & data_type)
{
return data_type->isNullable() || data_type->isLowCardinalityNullable();

View File

@ -49,7 +49,7 @@ public:
{
const auto & pos_arg = arguments[i];
if (!isUnsignedInteger(pos_arg))
if (!isUInt(pos_arg))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of {} argument of function {}", pos_arg->getName(), i, getName());
}

View File

@ -365,7 +365,7 @@ DataTypePtr FunctionGenerateRandomStructure::getReturnTypeImpl(const DataTypes &
for (size_t i = 0; i != arguments.size(); ++i)
{
if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull())
if (!isUInt(arguments[i]) && !arguments[i]->onlyNull())
{
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,

View File

@ -2033,7 +2033,7 @@ static inline bool isDateTime64(const ColumnsWithTypeAndName & arguments)
else if constexpr (std::is_same_v<Name, NameToDateTime> || std::is_same_v<Name, NameParseDateTimeBestEffort>
|| std::is_same_v<Name, NameParseDateTimeBestEffortOrZero> || std::is_same_v<Name, NameParseDateTimeBestEffortOrNull>)
{
return (arguments.size() == 2 && isUnsignedInteger(arguments[1].type)) || arguments.size() == 3;
return (arguments.size() == 2 && isUInt(arguments[1].type)) || arguments.size() == 3;
}
return false;

View File

@ -60,7 +60,7 @@ public:
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName());
if (!isUnsignedInteger(arguments[1]))
if (!isUInt(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[1]->getName(), getName());
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[2].get());

View File

@ -64,7 +64,7 @@ public:
if (arguments.size() > 1)
{
if (!isUnsignedInteger(arguments[1].type))
if (!isUInt(arguments[1].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Second argument (shingle size) of function {} must be unsigned integer, got {}",
getName(), arguments[1].type->getName());
@ -85,7 +85,7 @@ public:
"Function {} expect no more than two arguments (text, shingle size), got {}",
getName(), arguments.size());
if (!isUnsignedInteger(arguments[2].type))
if (!isUInt(arguments[2].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Third argument (num hashes) of function {} must be unsigned integer, got {}",
getName(), arguments[2].type->getName());

View File

@ -119,7 +119,7 @@ public:
if (arguments.size() >= 3)
{
if (!isUnsignedInteger(arguments[2]))
if (!isUInt(arguments[2]))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of function {}",

View File

@ -59,7 +59,7 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
FunctionArgumentDescriptors args{
{"days", &isNativeUnsignedInteger<IDataType>, nullptr, "UInt*"}
{"days", &isNativeUInt<IDataType>, nullptr, "UInt*"}
};
validateFunctionArgumentTypes(*this, arguments, args);

View File

@ -41,7 +41,7 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (!isUnsignedInteger(arguments[0].type))
if (!isUInt(arguments[0].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be unsigned integer", getName());
if (!arguments[0].column || !isColumnConst(*arguments[0].column))

View File

@ -47,7 +47,7 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (!isUnsignedInteger(arguments[1].type))
if (!isUInt(arguments[1].type))
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be unsigned integer", getName());
if (!arguments[1].column)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant", getName());

View File

@ -1147,7 +1147,7 @@ public:
double p;
if (isFloat(p_column.column->getDataType()))
p = p_column.column->getFloat64(0);
else if (isUnsignedInteger(p_column.column->getDataType()))
else if (isUInt(p_column.column->getDataType()))
p = p_column.column->getUInt(0);
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be either constant Float64 or constant UInt", getName());

View File

@ -57,7 +57,7 @@ public:
{
for (size_t i = 0; i < 4; ++i)
{
if (!isUnsignedInteger(arguments[i].type))
if (!isUInt(arguments[i].type))
{
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,

View File

@ -226,7 +226,7 @@ FillingTransform::FillingTransform(
throw Exception(ErrorCodes::INVALID_WITH_FILL_EXPRESSION,
"Incompatible types of WITH FILL expression values with column type {}", type->getName());
if (isUnsignedInteger(type) &&
if (isUInt(type) &&
((!descr.fill_from.isNull() && less(descr.fill_from, Field{0}, 1)) ||
(!descr.fill_to.isNull() && less(descr.fill_to, Field{0}, 1))))
{

View File

@ -36,12 +36,12 @@ SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF\xFF\xFF\xF
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\x41\xE2\x82\xAC'));
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x9F\x99\x82'));
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF'));
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC2\x01')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC1\x81')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x80\x80\x41')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC0\x80')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xD8\x00 ')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xDC\x00')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC2\x01')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC1\x81')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x80\x80\x41')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC0\x80')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xD8\x00 ')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xDC\x00')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8('😃🌍', '🙃😃🌑'), stringJaccardIndex('😃🌍', '🙃😃🌑');

View File

@ -2286,6 +2286,7 @@ stochasticlogisticregression
storages
storig
stringJaccardIndex
stringJaccardIndexUTF
stringToH
stripelog
strtod