#include #include #include #include #include #include #include /** Methods for reading floating point numbers from text with decimal representation. * There are "precise", "fast" and "simple" implementations. * * Neither of methods support hexadecimal numbers (0xABC), binary exponent (1p100), leading plus sign. * * Precise method always returns a number that is the closest machine representable number to the input. * * Fast method is faster (up to 3 times) and usually return the same value, * but in rare cases result may differ by lest significant bit (for Float32) * and by up to two least significant bits (for Float64) from precise method. * Also fast method may parse some garbage as some other unspecified garbage. * * Simple method is little faster for cases of parsing short (few digit) integers, but less precise and slower in other cases. * It's not recommended to use simple method and it is left only for reference. * * For performance test, look at 'read_float_perf' test. * * For precision test. * Parse all existing Float32 numbers: CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000); WITH toFloat32(toString(x)) AS y, reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x, reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y, abs(bin_x - bin_y) AS diff SELECT diff, count() FROM test.floats WHERE NOT isNaN(x) GROUP BY diff ORDER BY diff ASC LIMIT 100 * Here are the results: * Precise: ┌─diff─┬────count()─┐ │ 0 │ 4278190082 │ └──────┴────────────┘ (100% roundtrip property) Fast: ┌─diff─┬────count()─┐ │ 0 │ 3685260580 │ │ 1 │ 592929502 │ └──────┴────────────┘ (The difference is 1 in least significant bit in 13.8% of numbers.) Simple: ┌─diff─┬────count()─┐ │ 0 │ 2169879994 │ │ 1 │ 1807178292 │ │ 2 │ 269505944 │ │ 3 │ 28826966 │ │ 4 │ 2566488 │ │ 5 │ 212878 │ │ 6 │ 18276 │ │ 7 │ 1214 │ │ 8 │ 30 │ └──────┴────────────┘ * Parse random Float64 numbers: WITH rand64() AS bin_x, reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x, toFloat64(toString(x)) AS y, reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y, abs(bin_x - bin_y) AS diff SELECT diff, count() FROM numbers(100000000) WHERE NOT isNaN(x) GROUP BY diff ORDER BY diff ASC LIMIT 100 */ namespace DB { namespace ErrorCodes { extern const int CANNOT_PARSE_NUMBER; extern const int ARGUMENT_OUT_OF_BOUND; } /// Returns true, iff parsed. bool parseInfinity(ReadBuffer & buf); bool parseNaN(ReadBuffer & buf); void assertInfinity(ReadBuffer & buf); void assertNaN(ReadBuffer & buf); template bool assertOrParseInfinity(ReadBuffer & buf) { if constexpr (throw_exception) { assertInfinity(buf); return true; } else return parseInfinity(buf); } template bool assertOrParseNaN(ReadBuffer & buf) { if constexpr (throw_exception) { assertNaN(buf); return true; } else return parseNaN(buf); } /// Some garbage may be successfully parsed, examples: '--1' parsed as '1'. template ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf) { static_assert(std::is_same_v || std::is_same_v, "Argument for readFloatTextImpl must be float or double"); static constexpr bool throw_exception = std::is_same_v; if (buf.eof()) { if constexpr (throw_exception) throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER); else return ReturnType(false); } /// We use special code to read denormals (inf, nan), because we support slightly more variants that double-conversion library does: /// Example: inf and Infinity. bool negative = false; while (true) { switch (*buf.position()) { case '-': { negative = true; ++buf.position(); continue; } case 'i': [[fallthrough]]; case 'I': { if (assertOrParseInfinity(buf)) { x = std::numeric_limits::infinity(); if (negative) x = -x; return ReturnType(true); } return ReturnType(false); } case 'n': [[fallthrough]]; case 'N': { if (assertOrParseNaN(buf)) { x = std::numeric_limits::quiet_NaN(); if (negative) x = -x; return ReturnType(true); } return ReturnType(false); } default: break; } break; } static const double_conversion::StringToDoubleConverter converter( double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK, 0, 0, nullptr, nullptr); /// Fast path (avoid copying) if the buffer have at least MAX_LENGTH bytes. static constexpr int MAX_LENGTH = 316; if (buf.position() + MAX_LENGTH <= buf.buffer().end()) { int num_processed_characters = 0; if constexpr (std::is_same_v) x = converter.StringToDouble(buf.position(), buf.buffer().end() - buf.position(), &num_processed_characters); else x = converter.StringToFloat(buf.position(), buf.buffer().end() - buf.position(), &num_processed_characters); if (num_processed_characters < 0) { if constexpr (throw_exception) throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER); else return ReturnType(false); } buf.position() += num_processed_characters; if (negative) x = -x; return ReturnType(true); } else { /// Slow path. Copy characters that may be present in floating point number to temporary buffer. char tmp_buf[MAX_LENGTH]; int num_copied_chars = 0; while (!buf.eof() && num_copied_chars < MAX_LENGTH) { char c = *buf.position(); if (!(isNumericASCII(c) || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E')) break; tmp_buf[num_copied_chars] = c; ++buf.position(); ++num_copied_chars; } int num_processed_characters = 0; if constexpr (std::is_same_v) x = converter.StringToDouble(tmp_buf, num_copied_chars, &num_processed_characters); else x = converter.StringToFloat(tmp_buf, num_copied_chars, &num_processed_characters); if (num_processed_characters < num_copied_chars) { if constexpr (throw_exception) throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER); else return ReturnType(false); } if (negative) x = -x; return ReturnType(true); } } template static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf) { /// In optimistic case we can skip bound checking for first loop. if (buf.position() + N <= buf.buffer().end()) { for (size_t i = 0; i < N; ++i) { if (isNumericASCII(*buf.position())) { x *= 10; x += *buf.position() & 0x0F; ++buf.position(); } else return; } while (!buf.eof() && isNumericASCII(*buf.position())) ++buf.position(); } else { for (size_t i = 0; i < N; ++i) { if (!buf.eof() && isNumericASCII(*buf.position())) { x *= 10; x += *buf.position() & 0x0F; ++buf.position(); } else return; } while (!buf.eof() && isNumericASCII(*buf.position())) ++buf.position(); } } template ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) { static_assert(std::is_same_v || std::is_same_v, "Argument for readFloatTextImpl must be float or double"); static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII"); //-V501 static constexpr bool throw_exception = std::is_same_v; bool negative = false; x = 0; UInt64 before_point = 0; UInt64 after_point = 0; int after_point_exponent = 0; int exponent = 0; if (in.eof()) { if constexpr (throw_exception) throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER); else return false; } if (*in.position() == '-') { negative = true; ++in.position(); } auto count_after_sign = in.count(); constexpr int significant_digits = std::numeric_limits::digits10; readUIntTextUpToNSignificantDigits(before_point, in); int read_digits = in.count() - count_after_sign; if (unlikely(read_digits > significant_digits)) { int before_point_additional_exponent = read_digits - significant_digits; x = shift10(before_point, before_point_additional_exponent); } else { x = before_point; /// Shortcut for the common case when there is an integer that fit in Int64. if (read_digits && (in.eof() || *in.position() < '.')) { if (negative) x = -x; return ReturnType(true); } } if (checkChar('.', in)) { auto after_point_count = in.count(); while (!in.eof() && *in.position() == '0') ++in.position(); auto after_leading_zeros_count = in.count(); auto after_point_num_leading_zeros = after_leading_zeros_count - after_point_count; readUIntTextUpToNSignificantDigits(after_point, in); read_digits = in.count() - after_leading_zeros_count; after_point_exponent = (read_digits > significant_digits ? -significant_digits : -read_digits) - after_point_num_leading_zeros; } if (checkChar('e', in) || checkChar('E', in)) { if (in.eof()) { if constexpr (throw_exception) throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER); else return false; } bool exponent_negative = false; if (*in.position() == '-') { exponent_negative = true; ++in.position(); } else if (*in.position() == '+') { ++in.position(); } readUIntTextUpToNSignificantDigits<4>(exponent, in); if (exponent_negative) exponent = -exponent; } if (after_point) x += shift10(after_point, after_point_exponent); if (exponent) x = shift10(x, exponent); if (negative) x = -x; auto num_characters_without_sign = in.count() - count_after_sign; /// Denormals. At most one character is read before denormal and it is '-'. if (num_characters_without_sign == 0) { if (in.eof()) { if constexpr (throw_exception) throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER); else return false; } if (*in.position() == 'i' || *in.position() == 'I') { if (assertOrParseInfinity(in)) { x = std::numeric_limits::infinity(); if (negative) x = -x; return ReturnType(true); } return ReturnType(false); } else if (*in.position() == 'n' || *in.position() == 'N') { if (assertOrParseNaN(in)) { x = std::numeric_limits::quiet_NaN(); if (negative) x = -x; return ReturnType(true); } return ReturnType(false); } } return ReturnType(true); } template ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf) { static constexpr bool throw_exception = std::is_same_v; bool negative = false; x = 0; bool after_point = false; double power_of_ten = 1; if (buf.eof()) throwReadAfterEOF(); while (!buf.eof()) { switch (*buf.position()) { case '+': break; case '-': negative = true; break; case '.': after_point = true; break; case '0': [[fallthrough]]; case '1': [[fallthrough]]; case '2': [[fallthrough]]; case '3': [[fallthrough]]; case '4': [[fallthrough]]; case '5': [[fallthrough]]; case '6': [[fallthrough]]; case '7': [[fallthrough]]; case '8': [[fallthrough]]; case '9': if (after_point) { power_of_ten /= 10; x += (*buf.position() - '0') * power_of_ten; } else { x *= 10; x += *buf.position() - '0'; } break; case 'e': [[fallthrough]]; case 'E': { ++buf.position(); Int32 exponent = 0; readIntText(exponent, buf); x = shift10(x, exponent); if (negative) x = -x; return ReturnType(true); } case 'i': [[fallthrough]]; case 'I': { if (assertOrParseInfinity(buf)) { x = std::numeric_limits::infinity(); if (negative) x = -x; return ReturnType(true); } return ReturnType(false); } case 'n': [[fallthrough]]; case 'N': { if (assertOrParseNaN(buf)) { x = std::numeric_limits::quiet_NaN(); if (negative) x = -x; return ReturnType(true); } return ReturnType(false); } default: { if (negative) x = -x; return ReturnType(true); } } ++buf.position(); } if (negative) x = -x; return ReturnType(true); } template inline bool readDigits(ReadBuffer & buf, T & x, unsigned int & digits, int & exponent, bool digits_only = false) { x = 0; exponent = 0; unsigned int max_digits = digits; digits = 0; unsigned int places = 0; typename T::NativeType sign = 1; bool leading_zeroes = true; bool after_point = false; if (buf.eof()) { if constexpr (_throw_on_error) throwReadAfterEOF(); return false; } if (!buf.eof()) { switch (*buf.position()) { case '-': sign = -1; [[fallthrough]]; case '+': ++buf.position(); break; } } bool stop = false; while (!buf.eof() && !stop) { const char & byte = *buf.position(); switch (byte) { case '.': after_point = true; leading_zeroes = false; break; case '0': { if (leading_zeroes) break; if (after_point) { ++places; /// Count trailing zeroes. They would be used only if there's some other digit after them. break; } [[fallthrough]]; } case '1': [[fallthrough]]; case '2': [[fallthrough]]; case '3': [[fallthrough]]; case '4': [[fallthrough]]; case '5': [[fallthrough]]; case '6': [[fallthrough]]; case '7': [[fallthrough]]; case '8': [[fallthrough]]; case '9': { leading_zeroes = false; ++places; // num zeroes before + current digit if (digits + places > max_digits) { if constexpr (_throw_on_error) throw Exception("Too many digits (" + std::to_string(digits + places) + " > " + std::to_string(max_digits) + ") in decimal value", ErrorCodes::ARGUMENT_OUT_OF_BOUND); return false; } digits += places; if (after_point) exponent -= places; // TODO: accurate shift10 for big integers for (; places; --places) x *= 10; x += (byte - '0'); break; } case 'e': [[fallthrough]]; case 'E': { ++buf.position(); Int32 addition_exp = 0; readIntText(addition_exp, buf); exponent += addition_exp; stop = true; continue; } default: if (digits_only) { if constexpr (_throw_on_error) throw Exception("Unexpected symbol while reading decimal", ErrorCodes::CANNOT_PARSE_NUMBER); return false; } stop = true; continue; } ++buf.position(); } x *= sign; return true; } template inline void readDecimalText(ReadBuffer & buf, T & x, unsigned int precision, unsigned int & scale, bool digits_only = false) { unsigned int digits = precision; int exponent; readDigits(buf, x, digits, exponent, digits_only); if (static_cast(digits) + exponent > static_cast(precision - scale)) throw Exception("Decimal value is too big", ErrorCodes::ARGUMENT_OUT_OF_BOUND); if (static_cast(scale) + exponent < 0) throw Exception("Decimal value is too small", ErrorCodes::ARGUMENT_OUT_OF_BOUND); scale += exponent; } template inline bool tryReadDecimalText(ReadBuffer & buf, T & x, unsigned int precision, unsigned int & scale) { unsigned int digits = precision; int exponent; if (!readDigits(buf, x, digits, exponent, true) || static_cast(digits) + exponent > static_cast(precision - scale) || static_cast(scale) + exponent < 0) return false; scale += exponent; return true; } template inline void readCSVDecimalText(ReadBuffer & buf, T & x, unsigned int precision, unsigned int & scale) { if (buf.eof()) throwReadAfterEOF(); char maybe_quote = *buf.position(); if (maybe_quote == '\'' || maybe_quote == '\"') ++buf.position(); readDecimalText(buf, x, precision, scale, false); if (maybe_quote == '\'' || maybe_quote == '\"') assertChar(maybe_quote, buf); } template void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl(x, in); } template bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl(x, in); } template void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl(x, in); } template bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } template void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl(x, in); } template bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl(x, in); } /// Implementation that is selected as default. template void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); } template bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } }