From 27ee17ab02628113450bf41b3c21cedcd5e141a6 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 12 Sep 2023 18:06:57 +0000 Subject: [PATCH 1/4] Better integer types inference for Int64/UInt64 fields --- src/DataTypes/DataTypeUInt64OrInt64.h | 13 +++++ src/DataTypes/DataTypesNumber.h | 23 +++++++- src/DataTypes/FieldToDataType.cpp | 52 ++----------------- src/DataTypes/FieldToDataType.h | 10 ---- src/DataTypes/getLeastSupertype.cpp | 31 +++++++++++ .../02832_integer_type_inference.reference | 8 +++ .../02832_integer_type_inference.sql | 9 ++++ 7 files changed, 87 insertions(+), 59 deletions(-) create mode 100644 src/DataTypes/DataTypeUInt64OrInt64.h diff --git a/src/DataTypes/DataTypeUInt64OrInt64.h b/src/DataTypes/DataTypeUInt64OrInt64.h new file mode 100644 index 00000000000..8a11e42cde1 --- /dev/null +++ b/src/DataTypes/DataTypeUInt64OrInt64.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +namespace DB +{ + +class DataTypeUInt64OrInt64 : public DataTypeUInt64 +{ + +}; + +} diff --git a/src/DataTypes/DataTypesNumber.h b/src/DataTypes/DataTypesNumber.h index 5843086248c..92f01579459 100644 --- a/src/DataTypes/DataTypesNumber.h +++ b/src/DataTypes/DataTypesNumber.h @@ -9,10 +9,17 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + template -class DataTypeNumber final : public DataTypeNumberBase +class DataTypeNumber : public DataTypeNumberBase { public: + DataTypeNumber() = default; + bool equals(const IDataType & rhs) const override { return typeid(rhs) == typeid(*this); } bool canBeUsedAsVersion() const override { return true; } @@ -32,6 +39,20 @@ public: { return std::make_shared>(); } + + /// Special constructor for unsigned integers that can also fit into signed integer. + /// It's used for better type inference from fields. + /// See getLeastSupertype.cpp::convertUInt64toInt64IfPossible and FieldToDataType.cpp + explicit DataTypeNumber(bool unsigned_can_be_signed_) : DataTypeNumberBase(), unsigned_can_be_signed(unsigned_can_be_signed_) + { + if constexpr (std::is_signed_v) + throw Exception(ErrorCodes::LOGICAL_ERROR, "DataTypeNumber constructor with bool argument should not be used with signed integers"); + } + + bool canUnsignedBeSigned() const { return unsigned_can_be_signed; } + +private: + bool unsigned_can_be_signed = false; }; using DataTypeUInt8 = DataTypeNumber; diff --git a/src/DataTypes/FieldToDataType.cpp b/src/DataTypes/FieldToDataType.cpp index 837aae6753a..0131a3956f4 100644 --- a/src/DataTypes/FieldToDataType.cpp +++ b/src/DataTypes/FieldToDataType.cpp @@ -36,6 +36,7 @@ DataTypePtr FieldToDataType::operator() (const UInt64 & x) const if (x <= std::numeric_limits::max()) return std::make_shared(); if (x <= std::numeric_limits::max()) return std::make_shared(); if (x <= std::numeric_limits::max()) return std::make_shared(); + if (x <= std::numeric_limits::max()) return std::make_shared(/*unsigned_can_be_signed=*/true); return std::make_shared(); } @@ -136,17 +137,8 @@ DataTypePtr FieldToDataType::operator() (const Array & x) const DataTypes element_types; element_types.reserve(x.size()); - bool has_signed_int = false; - bool uint64_convert_possible = true; for (const Field & elem : x) - { - DataTypePtr type = applyVisitor(*this, elem); - element_types.emplace_back(type); - checkUInt64ToIn64Conversion(has_signed_int, uint64_convert_possible, type, elem); - } - - if (has_signed_int && uint64_convert_possible) - convertUInt64ToInt64IfPossible(element_types); + element_types.emplace_back(applyVisitor(*this, elem)); return std::make_shared(getLeastSupertype(element_types)); } @@ -174,28 +166,14 @@ DataTypePtr FieldToDataType::operator() (const Map & map) const key_types.reserve(map.size()); value_types.reserve(map.size()); - bool k_has_signed_int = false; - bool k_uint64_convert_possible = true; - bool v_has_signed_int = false; - bool v_uint64_convert_possible = true; for (const auto & elem : map) { const auto & tuple = elem.safeGet(); assert(tuple.size() == 2); - DataTypePtr k_type = applyVisitor(*this, tuple[0]); - key_types.push_back(k_type); - checkUInt64ToIn64Conversion(k_has_signed_int, k_uint64_convert_possible, k_type, tuple[0]); - DataTypePtr v_type = applyVisitor(*this, tuple[1]); - value_types.push_back(v_type); - checkUInt64ToIn64Conversion(v_has_signed_int, v_uint64_convert_possible, v_type, tuple[1]); + key_types.push_back(applyVisitor(*this, tuple[0])); + value_types.push_back(applyVisitor(*this, tuple[1])); } - if (k_has_signed_int && k_uint64_convert_possible) - convertUInt64ToInt64IfPossible(key_types); - - if (v_has_signed_int && v_uint64_convert_possible) - convertUInt64ToInt64IfPossible(value_types); - return std::make_shared( getLeastSupertype(key_types), getLeastSupertype(value_types)); @@ -227,28 +205,6 @@ DataTypePtr FieldToDataType::operator()(const bool &) const return DataTypeFactory::instance().get("Bool"); } -template -void FieldToDataType::checkUInt64ToIn64Conversion(bool & has_signed_int, bool & uint64_convert_possible, const DataTypePtr & type, const Field & elem) const -{ - if (uint64_convert_possible) - { - bool is_native_int = WhichDataType(type).isNativeInt(); - - if (is_native_int) - has_signed_int |= is_native_int; - else if (type->getTypeId() == TypeIndex::UInt64) - uint64_convert_possible &= (elem.template get() <= std::numeric_limits::max()); - } -} - -template -void FieldToDataType::convertUInt64ToInt64IfPossible(DataTypes & data_types) const -{ - for (auto& type : data_types) - if (type->getTypeId() == TypeIndex::UInt64) - type = std::make_shared(); -} - template class FieldToDataType; template class FieldToDataType; template class FieldToDataType; diff --git a/src/DataTypes/FieldToDataType.h b/src/DataTypes/FieldToDataType.h index d1a3f11e8de..8febadc1a0d 100644 --- a/src/DataTypes/FieldToDataType.h +++ b/src/DataTypes/FieldToDataType.h @@ -45,16 +45,6 @@ public: DataTypePtr operator() (const UInt256 & x) const; DataTypePtr operator() (const Int256 & x) const; DataTypePtr operator() (const bool & x) const; - -private: - // The conditions for converting UInt64 to Int64 are: - // 1. The existence of Int. - // 2. The existence of UInt64, and the UInt64 value must be <= Int64.max. - void checkUInt64ToIn64Conversion(bool& has_signed_int, bool& uint64_convert_possible, const DataTypePtr & type, const Field & elem) const; - - // Convert the UInt64 type to Int64 in order to cover other signed_integer types - // and obtain the least super type of all ints. - void convertUInt64ToInt64IfPossible(DataTypes & data_types) const; }; } diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index 9d42d82ce91..e5bdb4b267f 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -198,6 +198,35 @@ DataTypePtr getNumericType(const TypeIndexSet & types) return {}; } +/// Check if we can convert UInt64 to Int64 to avoid error "There is no supertype for types UInt64, Int64" +/// during inferring field types. +/// Example: +/// [-3236599669630092879, 5607475129431807682] +/// First field is inferred as Int64, but second one as UInt64, although it also can be Int64. +/// We don't support Int128 as supertype for Int64 and UInt64, because Int128 is inefficient. +/// But in this case the result type can be inferred as Array(Int64). +void convertUInt64toInt64IfPossible(const DataTypes & types, TypeIndexSet & types_set) +{ + /// Check if we have UInt64 and at least one Integer type. + if (!types_set.contains(TypeIndex::UInt64) + || (!types_set.contains(TypeIndex::Int8) && !types_set.contains(TypeIndex::Int16) && !types_set.contains(TypeIndex::Int32) + && !types_set.contains(TypeIndex::Int64))) + return; + + bool all_uint64_can_be_int64 = true; + for (const auto & type : types) + { + if (const auto * uint64_type = typeid_cast(type.get())) + all_uint64_can_be_int64 &= uint64_type->canUnsignedBeSigned(); + } + + if (all_uint64_can_be_int64) + { + types_set.erase(TypeIndex::UInt64); + types_set.insert(TypeIndex::Int64); + } +} + } template @@ -592,6 +621,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types) /// For numeric types, the most complicated part. { + /// First, if we have signed integers, try to convert all UInt64 to Int64 if possible. + convertUInt64toInt64IfPossible(types, type_ids); auto numeric_type = getNumericType(type_ids); if (numeric_type) return numeric_type; diff --git a/tests/queries/0_stateless/02832_integer_type_inference.reference b/tests/queries/0_stateless/02832_integer_type_inference.reference index 5a01bd4cd11..775fae2e0d2 100644 --- a/tests/queries/0_stateless/02832_integer_type_inference.reference +++ b/tests/queries/0_stateless/02832_integer_type_inference.reference @@ -1,2 +1,10 @@ [-4741124612489978151,-3236599669630092879,5607475129431807682] [100,-100,5607475129431807682,5607475129431807683] +[[-4741124612489978151],[-3236599669630092879,5607475129431807682]] +[[-4741124612489978151,-3236599669630092879],[5607475129431807682]] +[(-4741124612489978151,1),(-3236599669630092879,2),(560747512943180768,3)] +[-4741124612489978151,1,-3236599669630092879,2,560747512943180768,3] +{-4741124612489978151:1,-3236599669630092879:2,5607475129431807682:3} +[{-4741124612489978151:1,-3236599669630092879:2,5607475129431807682:3},{-1:1}] +{1:-4741124612489978151,2:-3236599669630092879,3:5607475129431807682} +[{1:-4741124612489978151,2:-3236599669630092879,3:5607475129431807682},{-1:1}] diff --git a/tests/queries/0_stateless/02832_integer_type_inference.sql b/tests/queries/0_stateless/02832_integer_type_inference.sql index 221e929d705..85ad7f55869 100644 --- a/tests/queries/0_stateless/02832_integer_type_inference.sql +++ b/tests/queries/0_stateless/02832_integer_type_inference.sql @@ -1,2 +1,11 @@ select [-4741124612489978151, -3236599669630092879, 5607475129431807682]; select [100, -100, 5607475129431807682, 5607475129431807683]; +select [[-4741124612489978151], [-3236599669630092879, 5607475129431807682]]; +select [[-4741124612489978151, -3236599669630092879], [5607475129431807682]]; +select [tuple(-4741124612489978151, 1), tuple(-3236599669630092879, 2), tuple(560747512943180768, 3)]; +select array(-4741124612489978151, 1, -3236599669630092879, 2, 560747512943180768, 3); +select map(-4741124612489978151, 1, -3236599669630092879, 2, 5607475129431807682, 3); +select [map(-4741124612489978151, 1, -3236599669630092879, 2, 5607475129431807682, 3), map(-1, 1)]; +select map(1, -4741124612489978151, 2, -3236599669630092879, 3, 5607475129431807682); +select [map(1, -4741124612489978151, 2, -3236599669630092879, 3, 5607475129431807682), map(-1, 1)]; + From c6884d4cb90e2a38bdeb00d042703305932497cb Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 12 Sep 2023 20:19:02 +0200 Subject: [PATCH 2/4] Remove unused file --- src/DataTypes/DataTypeUInt64OrInt64.h | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 src/DataTypes/DataTypeUInt64OrInt64.h diff --git a/src/DataTypes/DataTypeUInt64OrInt64.h b/src/DataTypes/DataTypeUInt64OrInt64.h deleted file mode 100644 index 8a11e42cde1..00000000000 --- a/src/DataTypes/DataTypeUInt64OrInt64.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -class DataTypeUInt64OrInt64 : public DataTypeUInt64 -{ - -}; - -} From e229fd54c9a2adab5905cba2a9f76048cdc2ccf6 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 12 Sep 2023 20:19:36 +0200 Subject: [PATCH 3/4] Return final --- src/DataTypes/DataTypesNumber.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/DataTypesNumber.h b/src/DataTypes/DataTypesNumber.h index 92f01579459..63d98dbf0f8 100644 --- a/src/DataTypes/DataTypesNumber.h +++ b/src/DataTypes/DataTypesNumber.h @@ -15,7 +15,7 @@ namespace ErrorCodes } template -class DataTypeNumber : public DataTypeNumberBase +class DataTypeNumber final : public DataTypeNumberBase { public: DataTypeNumber() = default; From 2e8ac2bdf9718e2f0b7e09af134cad7911b730db Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 15 Sep 2023 12:53:50 +0000 Subject: [PATCH 4/4] Fix function if --- src/DataTypes/DataTypesNumber.cpp | 6 ++++++ src/DataTypes/DataTypesNumber.h | 2 ++ src/Functions/if.cpp | 8 ++++++++ tests/queries/0_stateless/01065_if_not_finite.sql | 2 +- .../0_stateless/02832_integer_type_inference.reference | 1 + .../queries/0_stateless/02832_integer_type_inference.sql | 2 +- 6 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/DataTypes/DataTypesNumber.cpp b/src/DataTypes/DataTypesNumber.cpp index 232a5101cbe..1c0c418411b 100644 --- a/src/DataTypes/DataTypesNumber.cpp +++ b/src/DataTypes/DataTypesNumber.cpp @@ -35,6 +35,12 @@ static DataTypePtr createNumericDataType(const ASTPtr & arguments) return std::make_shared>(); } +bool isUInt64ThatCanBeInt64(const DataTypePtr & type) +{ + const DataTypeUInt64 * uint64_type = typeid_cast(type.get()); + return uint64_type && uint64_type->canUnsignedBeSigned(); +} + void registerDataTypeNumbers(DataTypeFactory & factory) { diff --git a/src/DataTypes/DataTypesNumber.h b/src/DataTypes/DataTypesNumber.h index 63d98dbf0f8..0c1f88a7925 100644 --- a/src/DataTypes/DataTypesNumber.h +++ b/src/DataTypes/DataTypesNumber.h @@ -71,4 +71,6 @@ using DataTypeInt128 = DataTypeNumber; using DataTypeUInt256 = DataTypeNumber; using DataTypeInt256 = DataTypeNumber; +bool isUInt64ThatCanBeInt64(const DataTypePtr & type); + } diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 65e2212e894..a955230f3d3 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -1105,6 +1105,14 @@ public: if (const auto * right_array = checkAndGetDataType(arg_else.type.get())) right_id = right_array->getNestedType()->getTypeId(); + /// Special case when one column is Integer and another is UInt64 that can be actually Int64. + /// The result type for this case is Int64 and we need to change UInt64 type to Int64 + /// so the NumberTraits::ResultOfIf will return Int64 instead if Int128. + if (isNativeInteger(arg_then.type) && isUInt64ThatCanBeInt64(arg_else.type)) + right_id = TypeIndex::Int64; + else if (isNativeInteger(arg_else.type) && isUInt64ThatCanBeInt64(arg_then.type)) + left_id = TypeIndex::Int64; + if (!(callOnBasicTypes(left_id, right_id, call) || (res = executeTyped(cond_col, arguments, result_type, input_rows_count)) || (res = executeString(cond_col, arguments, result_type)) diff --git a/tests/queries/0_stateless/01065_if_not_finite.sql b/tests/queries/0_stateless/01065_if_not_finite.sql index 495932692ea..c0f0721b2dc 100644 --- a/tests/queries/0_stateless/01065_if_not_finite.sql +++ b/tests/queries/0_stateless/01065_if_not_finite.sql @@ -6,6 +6,6 @@ SELECT ifNotFinite(nan, 2); SELECT ifNotFinite(-1 / 0, 2); SELECT ifNotFinite(log(0), NULL); SELECT ifNotFinite(sqrt(-1), -42); -SELECT ifNotFinite(1234567890123456789, -1234567890123456789); -- { serverError 386 } +SELECT ifNotFinite(12345678901234567890, -12345678901234567890); -- { serverError 386 } SELECT ifNotFinite(NULL, 1); diff --git a/tests/queries/0_stateless/02832_integer_type_inference.reference b/tests/queries/0_stateless/02832_integer_type_inference.reference index 775fae2e0d2..bc738afd538 100644 --- a/tests/queries/0_stateless/02832_integer_type_inference.reference +++ b/tests/queries/0_stateless/02832_integer_type_inference.reference @@ -8,3 +8,4 @@ [{-4741124612489978151:1,-3236599669630092879:2,5607475129431807682:3},{-1:1}] {1:-4741124612489978151,2:-3236599669630092879,3:5607475129431807682} [{1:-4741124612489978151,2:-3236599669630092879,3:5607475129431807682},{-1:1}] +-1234567890123456789 diff --git a/tests/queries/0_stateless/02832_integer_type_inference.sql b/tests/queries/0_stateless/02832_integer_type_inference.sql index 85ad7f55869..c6e7c744f39 100644 --- a/tests/queries/0_stateless/02832_integer_type_inference.sql +++ b/tests/queries/0_stateless/02832_integer_type_inference.sql @@ -8,4 +8,4 @@ select map(-4741124612489978151, 1, -3236599669630092879, 2, 5607475129431807682 select [map(-4741124612489978151, 1, -3236599669630092879, 2, 5607475129431807682, 3), map(-1, 1)]; select map(1, -4741124612489978151, 2, -3236599669630092879, 3, 5607475129431807682); select [map(1, -4741124612489978151, 2, -3236599669630092879, 3, 5607475129431807682), map(-1, 1)]; - +select if(materialize(1), -1234567890123456789, 1234567890123456789);