Merge pull request #64641 from azat/fix-float-inference

Fix type inference for float (in case of small buffer)
2024-11-23 08:02:02 +00:00 · 2024-06-12 11:38:54 +00:00 · 2024-06-12 11:38:54 +00:00 · 085c406f1f
commit 085c406f1f
parent 31a978d75d 918d3849e1
4 changed files with 116 additions and 66 deletions
--- a/src/Formats/SchemaInferenceUtils.cpp
+++ b/src/Formats/SchemaInferenceUtils.cpp
@ -879,11 +879,11 @@ namespace
    }
    template <bool is_json>
-    bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings)
+    bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings, bool & has_fractional)
    {
        if (is_json || settings.try_infer_exponent_floats)
-            return tryReadFloatText(value, buf);
+            return tryReadFloatTextExt(value, buf, has_fractional);
-        return tryReadFloatTextNoExponent(value, buf);
+        return tryReadFloatTextExtNoExponent(value, buf, has_fractional);
    }
    template <bool is_json>
@ -893,46 +893,31 @@ namespace
            return nullptr;
        Float64 tmp_float;
        bool has_fractional;
        if (settings.try_infer_integers)
        {
            /// If we read from String, we can do it in a more efficient way.
            if (auto * string_buf = dynamic_cast<ReadBufferFromString *>(&buf))
            {
                /// Remember the pointer to the start of the number to rollback to it.
                char * number_start = buf.position();
                Int64 tmp_int;
                bool read_int = tryReadIntText(tmp_int, buf);
                /// If we reached eof, it cannot be float (it requires no less data than integer)
                if (buf.eof())
                    return read_int ? std::make_shared<DataTypeInt64>() : nullptr;
                char * int_end = buf.position();
                /// We can safely get back to the start of the number, because we read from a string and we didn't reach eof.
-                buf.position() = number_start;
+                char * number_start = buf.position();
-                bool read_uint = false;
+                /// NOTE: this may break if tryReadFloat() != tryReadIntText() + parsing of '.'/'e'
-                char * uint_end = nullptr;
+                /// But, for now it is true
-                /// In case of Int64 overflow we can try to infer UInt64.
+                if (tryReadFloat<is_json>(tmp_float, buf, settings, has_fractional) && has_fractional)
                if (!read_int)
                {
                    UInt64 tmp_uint;
                    read_uint = tryReadIntText(tmp_uint, buf);
                    /// If we reached eof, it cannot be float (it requires no less data than integer)
                    if (buf.eof())
                        return read_uint ? std::make_shared<DataTypeUInt64>() : nullptr;
                    uint_end = buf.position();
                    buf.position() = number_start;
                }
                if (tryReadFloat<is_json>(tmp_float, buf, settings))
                {
                    if (read_int && buf.position() == int_end)
                        return std::make_shared<DataTypeInt64>();
                    if (read_uint && buf.position() == uint_end)
                        return std::make_shared<DataTypeUInt64>();
                    return std::make_shared<DataTypeFloat64>();
-                }
+
                Int64 tmp_int;
                buf.position() = number_start;
                if (tryReadIntText(tmp_int, buf))
                    return std::make_shared<DataTypeInt64>();
                /// In case of Int64 overflow we can try to infer UInt64.
                UInt64 tmp_uint;
                buf.position() = number_start;
                if (tryReadIntText(tmp_uint, buf))
                    return std::make_shared<DataTypeUInt64>();
                return nullptr;
            }
@ -942,36 +927,22 @@ namespace
            /// and then as float.
            PeekableReadBuffer peekable_buf(buf);
            PeekableReadBufferCheckpoint checkpoint(peekable_buf);
            Int64 tmp_int;
            bool read_int = tryReadIntText(tmp_int, peekable_buf);
            auto * int_end = peekable_buf.position();
            peekable_buf.rollbackToCheckpoint(true);
-            bool read_uint = false;
+            if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings, has_fractional) && has_fractional)
            char * uint_end = nullptr;
            /// In case of Int64 overflow we can try to infer UInt64.
            if (!read_int)
            {
                PeekableReadBufferCheckpoint new_checkpoint(peekable_buf);
                UInt64 tmp_uint;
                read_uint = tryReadIntText(tmp_uint, peekable_buf);
                uint_end = peekable_buf.position();
                peekable_buf.rollbackToCheckpoint(true);
            }
            if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings))
            {
                /// Float parsing reads no fewer bytes than integer parsing,
                /// so position of the buffer is either the same, or further.
                /// If it's the same, then it's integer.
                if (read_int && peekable_buf.position() == int_end)
                    return std::make_shared<DataTypeInt64>();
                if (read_uint && peekable_buf.position() == uint_end)
                    return std::make_shared<DataTypeUInt64>();
                return std::make_shared<DataTypeFloat64>();
-            }
+            peekable_buf.rollbackToCheckpoint(/* drop= */ false);
            Int64 tmp_int;
            if (tryReadIntText(tmp_int, peekable_buf))
                return std::make_shared<DataTypeInt64>();
            peekable_buf.rollbackToCheckpoint(/* drop= */ true);
            /// In case of Int64 overflow we can try to infer UInt64.
            UInt64 tmp_uint;
            if (tryReadIntText(tmp_uint, peekable_buf))
                return std::make_shared<DataTypeUInt64>();
        }
-        else if (tryReadFloat<is_json>(tmp_float, buf, settings))
+        else if (tryReadFloat<is_json>(tmp_float, buf, settings, has_fractional))
        {
            return std::make_shared<DataTypeFloat64>();
        }
@ -1004,7 +975,8 @@ namespace
        buf.position() = buf.buffer().begin();
        Float64 tmp;
-        if (tryReadFloat<is_json>(tmp, buf, settings) && buf.eof())
+        bool has_fractional;
        if (tryReadFloat<is_json>(tmp, buf, settings, has_fractional) && buf.eof())
            return std::make_shared<DataTypeFloat64>();
        return nullptr;
--- a/src/IO/readFloatText.h
+++ b/src/IO/readFloatText.h
@ -320,11 +320,13 @@ static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
 template <typename T, typename ReturnType, bool allow_exponent = true>
-ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
+ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in, bool & has_fractional)
 {
    static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
    static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");
    has_fractional = false;
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
    bool negative = false;
@ -377,6 +379,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
    if (checkChar('.', in))
    {
        has_fractional = true;
        auto after_point_count = in.count();
        while (!in.eof() && *in.position() == '0')
@ -394,6 +397,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
    {
        if (checkChar('e', in) || checkChar('E', in))
        {
            has_fractional = true;
            if (in.eof())
            {
                if constexpr (throw_exception)
@ -420,10 +424,14 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
    }
    if (after_point)
    {
        x += static_cast<T>(shift10(after_point, after_point_exponent));
    }
    if (exponent)
    {
        x = static_cast<T>(shift10(x, exponent));
    }
    if (negative)
        x = -x;
@ -590,8 +598,16 @@ ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
 template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
 template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }
-template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
+template <typename T> void readFloatTextFast(T & x, ReadBuffer & in)
-template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }
+{
    bool has_fractional;
    readFloatTextFastImpl<T, void>(x, in, has_fractional);
 }
 template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in)
 {
    bool has_fractional;
    return readFloatTextFastImpl<T, bool>(x, in, has_fractional);
 }
 template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
 template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }
@ -603,6 +619,21 @@ template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatText
 /// Bool-returning float parser; simply forwards to tryReadFloatTextFast.
 template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }
 /// Don't read exponent part of the number.
-template <typename T> bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool, false>(x, in); }
+template <typename T> bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in)
 {
    bool has_fractional;
    return readFloatTextFastImpl<T, bool, false>(x, in, has_fractional);
 }
 /// With a @has_fractional flag
 /// Used for input_format_try_infer_integers
 ///
 /// Forwards to readFloatTextFastImpl<T, bool>; the outcome is reported through the bool return value.
 /// @has_fractional is an output parameter: the implementation resets it to false on entry and
 /// sets it to true once checkChar() matches a '.' or an 'e'/'E' exponent marker.
 template <typename T> bool tryReadFloatTextExt(T & x, ReadBuffer & in, bool & has_fractional)
 {
    return readFloatTextFastImpl<T, bool>(x, in, has_fractional);
 }
 /// Same as tryReadFloatTextExt, but instantiates readFloatTextFastImpl with allow_exponent = false
 /// (presumably so that an exponent part is not read, as with tryReadFloatTextNoExponent).
 /// @has_fractional is an output parameter filled in by the implementation.
 template <typename T> bool tryReadFloatTextExtNoExponent(T & x, ReadBuffer & in, bool & has_fractional)
 {
    return readFloatTextFastImpl<T, bool, false>(x, in, has_fractional);
 }
 }
--- a/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference
+++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference
@ -0,0 +1,15 @@
 Int64
 x	Nullable(Int64)					
 x	Nullable(Int64)					
 x	Nullable(Int64)					
 Float64
 x	Nullable(Float64)					
 x	Nullable(Float64)					
 x	Nullable(Float64)					
 x	Nullable(Float64)					
 Float64.explicit File
 x	Nullable(Float64)					
 Float64.pipe
 x	Nullable(Float64)					
 Float64.default max_read_buffer_size
 x	Nullable(Float64)					
--- a/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh
+++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh
@ -0,0 +1,32 @@
 #!/usr/bin/env bash
 # Runs `desc "table"` over small JSONEachRow inputs with --max_read_buffer_size 1
 # and prints the inferred column type for integer-looking and float-looking values.
 # The expected output lives in the matching .reference file.
 CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
 . "$CUR_DIR"/../shell_config.sh
 # Integer-looking values must not always fall back to float.
 echo "Int64"
 $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1}'
 $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1}'
 $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : -1}'
 # Values with a fractional part, with and without an explicit '+' sign.
 echo "Float64"
 $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.1}'
 $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.1}'
 $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.111}'
 $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.111}'
 # The cases below are required because clickhouse-local previously did not
 # interpret --max_read_buffer_size for fds [1]
 #
 #   [1]: https://github.com/ClickHouse/ClickHouse/pull/64532
 echo "Float64.explicit File"
 # Temporary input file next to the test; removed by the EXIT trap.
 tmp_path=$(mktemp "$CUR_DIR/03170_float_schema_inference_small_block.json.XXXXXX")
 trap 'rm -f $tmp_path' EXIT
 cat > "$tmp_path" <<<'{"x" : 1.111}'
 $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' --file "$tmp_path"
 # Same check with the input arriving through a pipe.
 echo "Float64.pipe"
 echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"'
 # Same check without overriding --max_read_buffer_size.
 echo "Float64.default max_read_buffer_size"
 echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --input-format JSONEachRow 'desc "table"'