ClickHouse/src/IO/readFloatText.h

595 lines
17 KiB
C++
Raw Normal View History

2020-10-10 18:37:02 +00:00
#pragma once
2018-01-13 04:43:10 +00:00
#include <type_traits>
#include <IO/ReadHelpers.h>
2018-01-13 22:08:06 +00:00
#include <Core/Defines.h>
2018-01-13 18:01:31 +00:00
#include <common/shift10.h>
#include <Common/StringUtils/StringUtils.h>
2018-01-13 04:43:10 +00:00
#include <double-conversion/double-conversion.h>
2020-12-16 15:34:29 +00:00
#ifdef __clang__
2020-12-15 19:56:47 +00:00
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
2020-12-16 15:34:29 +00:00
#endif
2020-12-15 19:56:47 +00:00
#include <fast_float/fast_float.h>
2020-12-16 15:34:29 +00:00
#ifdef __clang__
2020-12-15 19:56:47 +00:00
#pragma clang diagnostic pop
2020-12-16 15:34:29 +00:00
#endif
2018-01-13 04:43:10 +00:00
/** Methods for reading floating point numbers from text with decimal representation.
* There are "precise", "fast" and "simple" implementations.
2018-01-13 22:00:15 +00:00
*
* Neither of methods support hexadecimal numbers (0xABC), binary exponent (1p100), leading plus sign.
*
2018-01-13 04:43:10 +00:00
* Precise method always returns a number that is the closest machine representable number to the input.
2018-01-13 22:00:15 +00:00
*
* Fast method is faster (up to 3 times) and usually return the same value,
2018-01-13 22:08:06 +00:00
* but in rare cases result may differ by lest significant bit (for Float32)
* and by up to two least significant bits (for Float64) from precise method.
2018-01-13 22:00:15 +00:00
* Also fast method may parse some garbage as some other unspecified garbage.
*
* Simple method is little faster for cases of parsing short (few digit) integers, but less precise and slower in other cases.
* It's not recommended to use simple method and it is left only for reference.
2018-01-13 21:03:22 +00:00
*
* For performance test, look at 'read_float_perf' test.
*
2018-01-13 22:00:15 +00:00
* For precision test.
* Parse all existing Float32 numbers:
2018-01-13 21:03:22 +00:00
CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000);
WITH
toFloat32(toString(x)) AS y,
reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x,
reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y,
abs(bin_x - bin_y) AS diff
SELECT
diff,
count()
FROM test.floats
WHERE NOT isNaN(x)
GROUP BY diff
ORDER BY diff ASC
2018-01-13 22:00:15 +00:00
LIMIT 100
* Here are the results:
*
Precise:
diffcount()
0 4278190082
(100% roundtrip property)
Fast:
diffcount()
0 3685260580
1 592929502
(The difference is 1 in least significant bit in 13.8% of numbers.)
Simple:
diffcount()
0 2169879994
1 1807178292
2 269505944
3 28826966
4 2566488
5 212878
6 18276
7 1214
8 30
* Parse random Float64 numbers:
WITH
rand64() AS bin_x,
reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x,
toFloat64(toString(x)) AS y,
reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y,
abs(bin_x - bin_y) AS diff
SELECT
diff,
count()
FROM numbers(100000000)
WHERE NOT isNaN(x)
GROUP BY diff
ORDER BY diff ASC
2018-01-13 21:03:22 +00:00
LIMIT 100
2018-01-13 04:43:10 +00:00
*/
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_PARSE_NUMBER;
}
/// Returns true, iff parsed.
bool parseInfinity(ReadBuffer & buf);
bool parseNaN(ReadBuffer & buf);
void assertInfinity(ReadBuffer & buf);
void assertNaN(ReadBuffer & buf);
template <bool throw_exception>
bool assertOrParseInfinity(ReadBuffer & buf)
{
if constexpr (throw_exception)
{
assertInfinity(buf);
return true;
}
else
return parseInfinity(buf);
}
template <bool throw_exception>
bool assertOrParseNaN(ReadBuffer & buf)
{
if constexpr (throw_exception)
{
assertNaN(buf);
return true;
}
else
return parseNaN(buf);
}
template <typename T, typename ReturnType>
ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
2018-01-13 04:43:10 +00:00
{
2020-12-05 16:45:22 +00:00
static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextFastFloatImpl must be float or double");
static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII"); //-V590
2018-01-13 04:43:10 +00:00
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
/// Fast path (avoid copying) if the buffer have at least MAX_LENGTH bytes.
static constexpr int MAX_LENGTH = 316;
2018-01-13 04:43:10 +00:00
if (likely(!buf.eof() && buf.position() + MAX_LENGTH <= buf.buffer().end()))
{
2020-12-07 16:26:38 +00:00
auto initial_position = buf.position();
auto res = fast_float::from_chars(initial_position, buf.buffer().end(), x);
if (unlikely(res.ec != std::errc()))
{
if constexpr (throw_exception)
2020-12-10 17:26:36 +00:00
throw ParsingException("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
else
return ReturnType(false);
}
2018-01-13 04:43:10 +00:00
buf.position() += res.ptr - initial_position;
2018-01-13 04:43:10 +00:00
return ReturnType(true);
}
else
2018-01-13 04:43:10 +00:00
{
/// Slow path. Copy characters that may be present in floating point number to temporary buffer.
bool negative = false;
2020-04-15 03:32:33 +00:00
2020-12-07 16:26:38 +00:00
/// We check eof here because we can parse +inf +nan
while (!buf.eof())
2020-12-05 16:45:22 +00:00
{
switch (*buf.position())
2018-01-13 04:43:10 +00:00
{
case '+':
2020-12-07 16:26:38 +00:00
++buf.position();
continue;
2018-01-13 04:43:10 +00:00
2020-12-07 16:26:38 +00:00
case '-':
{
negative = true;
++buf.position();
continue;
2018-01-13 04:43:10 +00:00
}
case 'i': [[fallthrough]];
2020-12-07 16:26:38 +00:00
case 'I':
{
if (assertOrParseInfinity<throw_exception>(buf))
{
x = std::numeric_limits<T>::infinity();
if (negative)
x = -x;
return ReturnType(true);
}
return ReturnType(false);
}
case 'n': [[fallthrough]];
2020-12-07 16:26:38 +00:00
case 'N':
{
if (assertOrParseNaN<throw_exception>(buf))
{
x = std::numeric_limits<T>::quiet_NaN();
if (negative)
x = -x;
return ReturnType(true);
}
return ReturnType(false);
2018-01-13 04:43:10 +00:00
}
default:
break;
2018-01-13 04:43:10 +00:00
}
break;
}
char tmp_buf[MAX_LENGTH];
int num_copied_chars = 0;
while (!buf.eof() && num_copied_chars < MAX_LENGTH)
{
char c = *buf.position();
if (!(isNumericASCII(c) || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E'))
2018-01-13 04:43:10 +00:00
break;
tmp_buf[num_copied_chars] = c;
++buf.position();
++num_copied_chars;
}
auto res = fast_float::from_chars(tmp_buf, tmp_buf + num_copied_chars, x);
if (unlikely(res.ec != std::errc()))
{
if constexpr (throw_exception)
2020-12-10 17:26:36 +00:00
throw ParsingException("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
2018-01-13 04:43:10 +00:00
else
return ReturnType(false);
2018-01-13 04:43:10 +00:00
}
if (negative)
x = -x;
2018-01-13 04:43:10 +00:00
return ReturnType(true);
2018-01-13 04:43:10 +00:00
}
}
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
2020-12-14 02:20:00 +00:00
static inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept
{
return (((val & 0xF0F0F0F0F0F0F0F0) | (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == 0x3333333333333333);
}
2020-12-14 02:20:00 +00:00
static inline bool is_made_of_eight_digits_fast(const char * chars) noexcept
{
uint64_t val;
::memcpy(&val, chars, 8);
return is_made_of_eight_digits_fast(val);
}
2018-01-13 04:43:10 +00:00
template <size_t N, typename T>
2018-01-13 22:11:27 +00:00
static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
2018-01-13 04:43:10 +00:00
{
2018-01-13 19:14:08 +00:00
/// In optimistic case we can skip bound checking for first loop.
2018-01-13 19:13:26 +00:00
if (buf.position() + N <= buf.buffer().end())
2018-01-13 04:43:10 +00:00
{
2018-01-13 19:13:26 +00:00
for (size_t i = 0; i < N; ++i)
2018-01-13 04:43:10 +00:00
{
if (isNumericASCII(*buf.position()))
2018-01-13 05:40:40 +00:00
{
x *= 10;
x += *buf.position() & 0x0F;
2018-01-13 19:13:26 +00:00
++buf.position();
2018-01-13 05:40:40 +00:00
}
2018-01-13 19:13:26 +00:00
else
return;
}
}
else
{
for (size_t i = 0; i < N; ++i)
{
if (!buf.eof() && isNumericASCII(*buf.position()))
2018-01-13 19:13:26 +00:00
{
x *= 10;
x += *buf.position() & 0x0F;
++buf.position();
}
else
return;
2018-01-13 04:43:10 +00:00
}
}
2018-01-13 19:13:26 +00:00
while (!buf.eof() && (buf.position() + 8 <= buf.buffer().end()) &&
is_made_of_eight_digits_fast(buf.position()))
{
buf.position() += 8;
2018-01-13 04:43:10 +00:00
}
while (!buf.eof() && isNumericASCII(*buf.position()))
++buf.position();
2018-01-13 04:43:10 +00:00
}
template <typename T, typename ReturnType>
ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
{
static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
2019-06-16 18:12:14 +00:00
static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII"); //-V590
2018-01-13 18:45:57 +00:00
2018-01-13 04:43:10 +00:00
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
2018-01-13 21:22:05 +00:00
bool negative = false;
x = 0;
2018-01-13 21:22:05 +00:00
UInt64 before_point = 0;
2018-01-13 04:43:10 +00:00
UInt64 after_point = 0;
2018-01-13 18:01:31 +00:00
int after_point_exponent = 0;
int exponent = 0;
2018-01-13 04:43:10 +00:00
2018-01-13 21:22:05 +00:00
if (in.eof())
{
if constexpr (throw_exception)
2020-12-10 17:26:36 +00:00
throw ParsingException("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
2018-01-13 21:22:05 +00:00
else
return false;
}
if (*in.position() == '-')
{
negative = true;
++in.position();
}
auto count_after_sign = in.count();
constexpr int significant_digits = std::numeric_limits<UInt64>::digits10;
2018-01-13 22:00:15 +00:00
readUIntTextUpToNSignificantDigits<significant_digits>(before_point, in);
2018-01-13 21:22:05 +00:00
int read_digits = in.count() - count_after_sign;
2018-01-13 04:43:10 +00:00
2018-01-13 21:26:08 +00:00
if (unlikely(read_digits > significant_digits))
{
int before_point_additional_exponent = read_digits - significant_digits;
x = shift10(before_point, before_point_additional_exponent);
}
2018-01-13 18:45:57 +00:00
else
{
2018-01-13 21:26:08 +00:00
x = before_point;
2018-01-13 18:45:57 +00:00
/// Shortcut for the common case when there is an integer that fit in Int64.
if (read_digits && (in.eof() || *in.position() < '.'))
{
2018-01-13 21:22:05 +00:00
if (negative)
x = -x;
2018-01-13 18:45:57 +00:00
return ReturnType(true);
}
}
2018-01-13 04:43:10 +00:00
if (checkChar('.', in))
{
auto after_point_count = in.count();
2018-01-13 22:00:15 +00:00
while (!in.eof() && *in.position() == '0')
++in.position();
auto after_leading_zeros_count = in.count();
auto after_point_num_leading_zeros = after_leading_zeros_count - after_point_count;
readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in);
2018-08-26 02:13:41 +00:00
read_digits = in.count() - after_leading_zeros_count;
2018-01-13 22:00:15 +00:00
after_point_exponent = (read_digits > significant_digits ? -significant_digits : -read_digits) - after_point_num_leading_zeros;
2018-01-13 04:43:10 +00:00
}
if (checkChar('e', in) || checkChar('E', in))
{
if (in.eof())
{
if constexpr (throw_exception)
2020-12-10 17:26:36 +00:00
throw ParsingException("Cannot read floating point value: nothing after exponent", ErrorCodes::CANNOT_PARSE_NUMBER);
else
return false;
}
bool exponent_negative = false;
if (*in.position() == '-')
{
exponent_negative = true;
++in.position();
}
else if (*in.position() == '+')
{
++in.position();
}
2018-01-13 22:00:15 +00:00
readUIntTextUpToNSignificantDigits<4>(exponent, in);
2018-01-13 21:22:05 +00:00
if (exponent_negative)
exponent = -exponent;
2018-01-13 04:43:10 +00:00
}
if (after_point)
2018-01-13 21:22:05 +00:00
x += shift10(after_point, after_point_exponent);
2018-01-13 04:43:10 +00:00
if (exponent)
2018-01-13 18:01:31 +00:00
x = shift10(x, exponent);
2018-01-13 04:43:10 +00:00
2018-01-13 21:22:05 +00:00
if (negative)
x = -x;
2018-01-13 04:43:10 +00:00
2018-01-13 21:22:05 +00:00
auto num_characters_without_sign = in.count() - count_after_sign;
2018-01-13 04:43:10 +00:00
/// Denormals. At most one character is read before denormal and it is '-'.
2018-01-13 21:22:05 +00:00
if (num_characters_without_sign == 0)
2018-01-13 04:43:10 +00:00
{
2018-01-13 21:22:05 +00:00
if (in.eof())
{
if constexpr (throw_exception)
2020-12-10 17:26:36 +00:00
throw ParsingException("Cannot read floating point value: no digits read", ErrorCodes::CANNOT_PARSE_NUMBER);
2018-01-13 21:22:05 +00:00
else
return false;
}
2018-01-13 04:43:10 +00:00
2020-04-15 03:32:33 +00:00
if (*in.position() == '+')
{
++in.position();
if (in.eof())
{
if constexpr (throw_exception)
2020-12-10 17:26:36 +00:00
throw ParsingException("Cannot read floating point value: nothing after plus sign", ErrorCodes::CANNOT_PARSE_NUMBER);
2020-04-15 03:32:33 +00:00
else
return false;
}
else if (negative)
{
if constexpr (throw_exception)
2020-12-10 17:26:36 +00:00
throw ParsingException("Cannot read floating point value: plus after minus sign", ErrorCodes::CANNOT_PARSE_NUMBER);
2020-04-15 03:32:33 +00:00
else
return false;
}
}
2018-01-13 04:43:10 +00:00
if (*in.position() == 'i' || *in.position() == 'I')
{
if (assertOrParseInfinity<throw_exception>(in))
{
x = std::numeric_limits<T>::infinity();
if (negative)
x = -x;
return ReturnType(true);
}
return ReturnType(false);
}
else if (*in.position() == 'n' || *in.position() == 'N')
{
if (assertOrParseNaN<throw_exception>(in))
{
x = std::numeric_limits<T>::quiet_NaN();
if (negative)
x = -x;
return ReturnType(true);
}
return ReturnType(false);
}
}
return ReturnType(true);
}
template <typename T, typename ReturnType>
ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
bool negative = false;
x = 0;
bool after_point = false;
double power_of_ten = 1;
if (buf.eof())
throwReadAfterEOF();
while (!buf.eof())
{
switch (*buf.position())
{
case '+':
break;
case '-':
negative = true;
break;
case '.':
after_point = true;
break;
case '0': [[fallthrough]];
case '1': [[fallthrough]];
case '2': [[fallthrough]];
case '3': [[fallthrough]];
case '4': [[fallthrough]];
case '5': [[fallthrough]];
case '6': [[fallthrough]];
case '7': [[fallthrough]];
case '8': [[fallthrough]];
case '9':
if (after_point)
{
power_of_ten /= 10;
x += (*buf.position() - '0') * power_of_ten;
}
else
{
x *= 10;
x += *buf.position() - '0';
}
break;
case 'e': [[fallthrough]];
case 'E':
{
++buf.position();
Int32 exponent = 0;
readIntText(exponent, buf);
2018-01-13 18:01:31 +00:00
x = shift10(x, exponent);
2018-01-13 04:43:10 +00:00
if (negative)
x = -x;
2018-01-13 05:20:18 +00:00
return ReturnType(true);
2018-01-13 04:43:10 +00:00
}
case 'i': [[fallthrough]];
case 'I':
{
if (assertOrParseInfinity<throw_exception>(buf))
{
x = std::numeric_limits<T>::infinity();
if (negative)
x = -x;
return ReturnType(true);
}
return ReturnType(false);
}
case 'n': [[fallthrough]];
case 'N':
{
if (assertOrParseNaN<throw_exception>(buf))
{
x = std::numeric_limits<T>::quiet_NaN();
if (negative)
x = -x;
return ReturnType(true);
}
return ReturnType(false);
}
default:
{
if (negative)
x = -x;
2018-01-13 05:20:18 +00:00
return ReturnType(true);
2018-01-13 04:43:10 +00:00
}
}
++buf.position();
}
if (negative)
x = -x;
2018-01-13 05:20:18 +00:00
return ReturnType(true);
2018-01-13 04:43:10 +00:00
}
template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }
template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }
template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }
/// Implementation that is selected as default.
template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); }
template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }
}