Merge pull request #64641 from azat/fix-float-inference

Fix type inference for float (in case of small buffer)
This commit is contained in:
Kruglov Pavel 2024-06-12 11:38:54 +00:00 committed by GitHub
commit 085c406f1f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 116 additions and 66 deletions

View File

@ -879,11 +879,11 @@ namespace
}
template <bool is_json>
bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings)
bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings, bool & has_fractional)
{
if (is_json || settings.try_infer_exponent_floats)
return tryReadFloatText(value, buf);
return tryReadFloatTextNoExponent(value, buf);
return tryReadFloatTextExt(value, buf, has_fractional);
return tryReadFloatTextExtNoExponent(value, buf, has_fractional);
}
template <bool is_json>
@ -893,46 +893,31 @@ namespace
return nullptr;
Float64 tmp_float;
bool has_fractional;
if (settings.try_infer_integers)
{
/// If we read from String, we can do it in a more efficient way.
if (auto * string_buf = dynamic_cast<ReadBufferFromString *>(&buf))
{
/// Remember the pointer to the start of the number to rollback to it.
char * number_start = buf.position();
Int64 tmp_int;
bool read_int = tryReadIntText(tmp_int, buf);
/// If we reached eof, it cannot be float (it requires no less data than integer)
if (buf.eof())
return read_int ? std::make_shared<DataTypeInt64>() : nullptr;
char * int_end = buf.position();
/// We can safely get back to the start of the number, because we read from a string and we didn't reach eof.
buf.position() = number_start;
char * number_start = buf.position();
bool read_uint = false;
char * uint_end = nullptr;
/// In case of Int64 overflow we can try to infer UInt64.
if (!read_int)
{
UInt64 tmp_uint;
read_uint = tryReadIntText(tmp_uint, buf);
/// If we reached eof, it cannot be float (it requires no less data than integer)
if (buf.eof())
return read_uint ? std::make_shared<DataTypeUInt64>() : nullptr;
uint_end = buf.position();
buf.position() = number_start;
}
if (tryReadFloat<is_json>(tmp_float, buf, settings))
{
if (read_int && buf.position() == int_end)
return std::make_shared<DataTypeInt64>();
if (read_uint && buf.position() == uint_end)
return std::make_shared<DataTypeUInt64>();
/// NOTE: this relies on tryReadFloat() being equivalent to tryReadIntText() + parsing of '.'/'e'.
/// It may break if that ever stops being the case, but for now it holds.
if (tryReadFloat<is_json>(tmp_float, buf, settings, has_fractional) && has_fractional)
return std::make_shared<DataTypeFloat64>();
}
Int64 tmp_int;
buf.position() = number_start;
if (tryReadIntText(tmp_int, buf))
return std::make_shared<DataTypeInt64>();
/// In case of Int64 overflow we can try to infer UInt64.
UInt64 tmp_uint;
buf.position() = number_start;
if (tryReadIntText(tmp_uint, buf))
return std::make_shared<DataTypeUInt64>();
return nullptr;
}
@ -942,36 +927,22 @@ namespace
/// and then as float.
PeekableReadBuffer peekable_buf(buf);
PeekableReadBufferCheckpoint checkpoint(peekable_buf);
Int64 tmp_int;
bool read_int = tryReadIntText(tmp_int, peekable_buf);
auto * int_end = peekable_buf.position();
peekable_buf.rollbackToCheckpoint(true);
bool read_uint = false;
char * uint_end = nullptr;
/// In case of Int64 overflow we can try to infer UInt64.
if (!read_int)
{
PeekableReadBufferCheckpoint new_checkpoint(peekable_buf);
UInt64 tmp_uint;
read_uint = tryReadIntText(tmp_uint, peekable_buf);
uint_end = peekable_buf.position();
peekable_buf.rollbackToCheckpoint(true);
}
if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings))
{
/// Float parsing reads no fewer bytes than integer parsing,
/// so position of the buffer is either the same, or further.
/// If it's the same, then it's integer.
if (read_int && peekable_buf.position() == int_end)
return std::make_shared<DataTypeInt64>();
if (read_uint && peekable_buf.position() == uint_end)
return std::make_shared<DataTypeUInt64>();
if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings, has_fractional) && has_fractional)
return std::make_shared<DataTypeFloat64>();
peekable_buf.rollbackToCheckpoint(/* drop= */ false);
Int64 tmp_int;
if (tryReadIntText(tmp_int, peekable_buf))
return std::make_shared<DataTypeInt64>();
peekable_buf.rollbackToCheckpoint(/* drop= */ true);
/// In case of Int64 overflow we can try to infer UInt64.
UInt64 tmp_uint;
if (tryReadIntText(tmp_uint, peekable_buf))
return std::make_shared<DataTypeUInt64>();
}
}
else if (tryReadFloat<is_json>(tmp_float, buf, settings))
else if (tryReadFloat<is_json>(tmp_float, buf, settings, has_fractional))
{
return std::make_shared<DataTypeFloat64>();
}
@ -1004,7 +975,8 @@ namespace
buf.position() = buf.buffer().begin();
Float64 tmp;
if (tryReadFloat<is_json>(tmp, buf, settings) && buf.eof())
bool has_fractional;
if (tryReadFloat<is_json>(tmp, buf, settings, has_fractional) && buf.eof())
return std::make_shared<DataTypeFloat64>();
return nullptr;

View File

@ -320,11 +320,13 @@ static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
template <typename T, typename ReturnType, bool allow_exponent = true>
ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in, bool & has_fractional)
{
static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");
has_fractional = false;
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
bool negative = false;
@ -377,6 +379,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
if (checkChar('.', in))
{
has_fractional = true;
auto after_point_count = in.count();
while (!in.eof() && *in.position() == '0')
@ -394,6 +397,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
{
if (checkChar('e', in) || checkChar('E', in))
{
has_fractional = true;
if (in.eof())
{
if constexpr (throw_exception)
@ -420,10 +424,14 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
}
if (after_point)
{
x += static_cast<T>(shift10(after_point, after_point_exponent));
}
if (exponent)
{
x = static_cast<T>(shift10(x, exponent));
}
if (negative)
x = -x;
@ -590,8 +598,16 @@ ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }
template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }
/// Reads a floating point number from `in` into `x` using readFloatTextFastImpl
/// with ReturnType = void. The has_fractional flag reported by the
/// implementation is not needed here and is discarded.
template <typename T>
void readFloatTextFast(T & x, ReadBuffer & in)
{
    bool ignored_has_fractional = false;
    readFloatTextFastImpl<T, void>(x, in, ignored_has_fractional);
}
/// Tries to read a floating point number from `in` into `x` using
/// readFloatTextFastImpl with ReturnType = bool and returns its result.
/// The has_fractional flag reported by the implementation is discarded.
template <typename T>
bool tryReadFloatTextFast(T & x, ReadBuffer & in)
{
    bool ignored_has_fractional = false;
    const bool parsed = readFloatTextFastImpl<T, bool>(x, in, ignored_has_fractional);
    return parsed;
}
template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }
@ -603,6 +619,21 @@ template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatText
template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }
/// Don't read exponent part of the number.
template <typename T> bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool, false>(x, in); }
template <typename T> bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in)
{
bool has_fractional;
return readFloatTextFastImpl<T, bool, false>(x, in, has_fractional);
}
/// Extended variant that exposes the @has_fractional flag to the caller:
/// readFloatTextFastImpl sets it to true when it consumes a '.' or an 'e'/'E',
/// and to false otherwise.
/// Used for input_format_try_infer_integers.
template <typename T>
bool tryReadFloatTextExt(T & x, ReadBuffer & in, bool & has_fractional)
{
    const bool parsed = readFloatTextFastImpl<T, bool>(x, in, has_fractional);
    return parsed;
}
template <typename T> bool tryReadFloatTextExtNoExponent(T & x, ReadBuffer & in, bool & has_fractional)
{
return readFloatTextFastImpl<T, bool, false>(x, in, has_fractional);
}
}

View File

@ -0,0 +1,15 @@
Int64
x Nullable(Int64)
x Nullable(Int64)
x Nullable(Int64)
Float64
x Nullable(Float64)
x Nullable(Float64)
x Nullable(Float64)
x Nullable(Float64)
Float64.explicit File
x Nullable(Float64)
Float64.pipe
x Nullable(Float64)
Float64.default max_read_buffer_size
x Nullable(Float64)

View File

@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Describes the schema inferred for JSONEachRow input while reading with
# --max_read_buffer_size 1, for integer and fractional values of "x".
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# integers must not always fall back to float
echo "Int64"
$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1}'
$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1}'
$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : -1}'
echo "Float64"
$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.1}'
$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.1}'
$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.111}'
$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.111}'
# this is required because clickhouse-local previously did not interpret
# --max_read_buffer_size for fds [1]
#
# [1]: https://github.com/ClickHouse/ClickHouse/pull/64532
echo "Float64.explicit File"
tmp_path=$(mktemp "$CUR_DIR/03170_float_schema_inference_small_block.json.XXXXXX")
# Quote the path inside the trap so cleanup still works if it contains spaces.
trap 'rm -f "$tmp_path"' EXIT
cat > "$tmp_path" <<<'{"x" : 1.111}'
$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' --file "$tmp_path"
echo "Float64.pipe"
echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"'
echo "Float64.default max_read_buffer_size"
echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --input-format JSONEachRow 'desc "table"'