Merge pull request #64641 from azat/fix-float-inference

Fix type inference for float (in case of small buffer)
2024-11-27 01:51:59 +00:00 · 2024-06-12 11:38:54 +00:00 · 2024-06-12 11:38:54 +00:00 · 085c406f1f
commit 085c406f1f
parent 31a978d75d 918d3849e1
4 changed files with 116 additions and 66 deletions
--- a/src/Formats/SchemaInferenceUtils.cpp
+++ b/src/Formats/SchemaInferenceUtils.cpp
@ -879,11 +879,11 @@ namespace
    }

    template <bool is_json>
-    bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings)
+    bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings, bool & has_fractional)
    {
        if (is_json || settings.try_infer_exponent_floats)
-            return tryReadFloatText(value, buf);
-        return tryReadFloatTextNoExponent(value, buf);
+            return tryReadFloatTextExt(value, buf, has_fractional);
+        return tryReadFloatTextExtNoExponent(value, buf, has_fractional);
    }

    template <bool is_json>
@ -893,46 +893,31 @@ namespace
            return nullptr;

        Float64 tmp_float;
+        bool has_fractional;
        if (settings.try_infer_integers)
        {
            /// If we read from String, we can do it in a more efficient way.
            if (auto * string_buf = dynamic_cast<ReadBufferFromString *>(&buf))
            {
                /// Remember the pointer to the start of the number to rollback to it.
-                char * number_start = buf.position();
-                Int64 tmp_int;
-                bool read_int = tryReadIntText(tmp_int, buf);
-                /// If we reached eof, it cannot be float (it requires no less data than integer)
-                if (buf.eof())
-                    return read_int ? std::make_shared<DataTypeInt64>() : nullptr;
-
-                char * int_end = buf.position();
                /// We can safely get back to the start of the number, because we read from a string and we didn't reach eof.
-                buf.position() = number_start;
+                char * number_start = buf.position();

-                bool read_uint = false;
-                char * uint_end = nullptr;
-                /// In case of Int64 overflow we can try to infer UInt64.
-                if (!read_int)
-                {
-                    UInt64 tmp_uint;
-                    read_uint = tryReadIntText(tmp_uint, buf);
-                    /// If we reached eof, it cannot be float (it requires no less data than integer)
-                    if (buf.eof())
-                        return read_uint ? std::make_shared<DataTypeUInt64>() : nullptr;
-
-                    uint_end = buf.position();
-                    buf.position() = number_start;
-                }
-
-                if (tryReadFloat<is_json>(tmp_float, buf, settings))
-                {
-                    if (read_int && buf.position() == int_end)
-                        return std::make_shared<DataTypeInt64>();
-                    if (read_uint && buf.position() == uint_end)
-                        return std::make_shared<DataTypeUInt64>();
+                /// NOTE: this relies on tryReadFloat() being equivalent to tryReadIntText() followed by parsing of '.'/'e'.
+                /// It will break if that ever stops being the case, but for now it holds.
+                if (tryReadFloat<is_json>(tmp_float, buf, settings, has_fractional) && has_fractional)
                    return std::make_shared<DataTypeFloat64>();
-                }
+
+                Int64 tmp_int;
+                buf.position() = number_start;
+                if (tryReadIntText(tmp_int, buf))
+                    return std::make_shared<DataTypeInt64>();
+
+                /// In case of Int64 overflow we can try to infer UInt64.
+                UInt64 tmp_uint;
+                buf.position() = number_start;
+                if (tryReadIntText(tmp_uint, buf))
+                    return std::make_shared<DataTypeUInt64>();

                return nullptr;
            }
@ -942,36 +927,22 @@ namespace
            /// and then as float.
            PeekableReadBuffer peekable_buf(buf);
            PeekableReadBufferCheckpoint checkpoint(peekable_buf);
-            Int64 tmp_int;
-            bool read_int = tryReadIntText(tmp_int, peekable_buf);
-            auto * int_end = peekable_buf.position();
-            peekable_buf.rollbackToCheckpoint(true);

-            bool read_uint = false;
-            char * uint_end = nullptr;
-            /// In case of Int64 overflow we can try to infer UInt64.
-            if (!read_int)
-            {
-                PeekableReadBufferCheckpoint new_checkpoint(peekable_buf);
-                UInt64 tmp_uint;
-                read_uint = tryReadIntText(tmp_uint, peekable_buf);
-                uint_end = peekable_buf.position();
-                peekable_buf.rollbackToCheckpoint(true);
-            }
-
-            if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings))
-            {
-                /// Float parsing reads no fewer bytes than integer parsing,
-                /// so position of the buffer is either the same, or further.
-                /// If it's the same, then it's integer.
-                if (read_int && peekable_buf.position() == int_end)
-                    return std::make_shared<DataTypeInt64>();
-                if (read_uint && peekable_buf.position() == uint_end)
-                    return std::make_shared<DataTypeUInt64>();
+            if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings, has_fractional) && has_fractional)
                return std::make_shared<DataTypeFloat64>();
+            peekable_buf.rollbackToCheckpoint(/* drop= */ false);
+
+            Int64 tmp_int;
+            if (tryReadIntText(tmp_int, peekable_buf))
+                return std::make_shared<DataTypeInt64>();
+            peekable_buf.rollbackToCheckpoint(/* drop= */ true);
+
+            /// In case of Int64 overflow we can try to infer UInt64.
+            UInt64 tmp_uint;
+            if (tryReadIntText(tmp_uint, peekable_buf))
+                return std::make_shared<DataTypeUInt64>();
        }
-        }
-        else if (tryReadFloat<is_json>(tmp_float, buf, settings))
+        else if (tryReadFloat<is_json>(tmp_float, buf, settings, has_fractional))
        {
            return std::make_shared<DataTypeFloat64>();
        }
@ -1004,7 +975,8 @@ namespace
        buf.position() = buf.buffer().begin();

        Float64 tmp;
-        if (tryReadFloat<is_json>(tmp, buf, settings) && buf.eof())
+        bool has_fractional;
+        if (tryReadFloat<is_json>(tmp, buf, settings, has_fractional) && buf.eof())
            return std::make_shared<DataTypeFloat64>();

        return nullptr;
--- a/src/IO/readFloatText.h
+++ b/src/IO/readFloatText.h
@ -320,11 +320,13 @@ static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)


 template <typename T, typename ReturnType, bool allow_exponent = true>
-ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
+ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in, bool & has_fractional)
 {
    static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
    static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");

+    has_fractional = false;
+
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    bool negative = false;
@ -377,6 +379,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)

    if (checkChar('.', in))
    {
+        has_fractional = true;
        auto after_point_count = in.count();

        while (!in.eof() && *in.position() == '0')
@ -394,6 +397,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
    {
        if (checkChar('e', in) || checkChar('E', in))
        {
+            has_fractional = true;
            if (in.eof())
            {
                if constexpr (throw_exception)
@ -420,10 +424,14 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
    }

    if (after_point)
+    {
        x += static_cast<T>(shift10(after_point, after_point_exponent));
+    }

    if (exponent)
+    {
        x = static_cast<T>(shift10(x, exponent));
+    }

    if (negative)
        x = -x;
@ -590,8 +598,16 @@ ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
 template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
 template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }

-template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
-template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }
+template <typename T> void readFloatTextFast(T & x, ReadBuffer & in)
+{
+    bool has_fractional;
+    readFloatTextFastImpl<T, void>(x, in, has_fractional);
+}
+template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in)
+{
+    bool has_fractional;
+    return readFloatTextFastImpl<T, bool>(x, in, has_fractional);
+}

 template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
 template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }
@ -603,6 +619,21 @@ template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatText
 template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }

 /// Don't read exponent part of the number.
-template <typename T> bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool, false>(x, in); }
+template <typename T> bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in)
+{
+    bool has_fractional;
+    return readFloatTextFastImpl<T, bool, false>(x, in, has_fractional);
+}
+
+/// Variants that additionally set @has_fractional to true when a '.' or an exponent marker was read.
+/// Used for input_format_try_infer_integers
+template <typename T> bool tryReadFloatTextExt(T & x, ReadBuffer & in, bool & has_fractional)
+{
+    return readFloatTextFastImpl<T, bool>(x, in, has_fractional);
+}
+template <typename T> bool tryReadFloatTextExtNoExponent(T & x, ReadBuffer & in, bool & has_fractional)
+{
+    return readFloatTextFastImpl<T, bool, false>(x, in, has_fractional);
+}

 }
--- a/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference
+++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference
@ -0,0 +1,15 @@
+Int64
+x	Nullable(Int64)					
+x	Nullable(Int64)					
+x	Nullable(Int64)					
+Float64
+x	Nullable(Float64)					
+x	Nullable(Float64)					
+x	Nullable(Float64)					
+x	Nullable(Float64)					
+Float64.explicit File
+x	Nullable(Float64)					
+Float64.pipe
+x	Nullable(Float64)					
+Float64.default max_read_buffer_size
+x	Nullable(Float64)					
--- a/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh
+++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh
@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+# do not always fall back to float
+echo "Int64"
+$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1}'
+$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1}'
+$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : -1}'
+
+echo "Float64"
+$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.1}'
+$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.1}'
+$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.111}'
+$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.111}'
+
+# this is required because clickhouse-local previously did not interpret
+# --max_read_buffer_size for fds [1]
+#
+#   [1]: https://github.com/ClickHouse/ClickHouse/pull/64532
+echo "Float64.explicit File"
+tmp_path=$(mktemp "$CUR_DIR/03170_float_schema_inference_small_block.json.XXXXXX")
+trap 'rm -f $tmp_path' EXIT
+cat > "$tmp_path" <<<'{"x" : 1.111}'
+$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' --file "$tmp_path"
+
+echo "Float64.pipe"
+echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"'
+echo "Float64.default max_read_buffer_size"
+echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --input-format JSONEachRow 'desc "table"'