ClickHouse/dbms/src/IO/readFloatText.h

#include <type_traits>
#include <IO/ReadHelpers.h>
#include <Core/Defines.h>
#include <common/shift10.h>
#include <common/likely.h>
#include <Common/StringUtils/StringUtils.h>
#include <double-conversion/double-conversion.h>


/** Methods for reading floating point numbers from text with decimal representation.
  * There are "precise", "fast" and "simple" implementations.
  *
  * Neither of methods support hexadecimal numbers (0xABC), binary exponent (1p100), leading plus sign.
  *
  * Precise method always returns a number that is the closest machine representable number to the input.
  *
  * Fast method is faster (up to 3 times) and usually return the same value,
  *  but in rare cases result may differ by lest significant bit (for Float32)
  *  and by up to two least significant bits (for Float64) from precise method.
  * Also fast method may parse some garbage as some other unspecified garbage.
  *
  * Simple method is little faster for cases of parsing short (few digit) integers, but less precise and slower in other cases.
  * It's not recommended to use simple method and it is left only for reference.
  *
  * For performance test, look at 'read_float_perf' test.
  *
  * For precision test.
  * Parse all existing Float32 numbers:

CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000);

WITH
    toFloat32(toString(x)) AS y,
    reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x,
    reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y,
    abs(bin_x - bin_y) AS diff
SELECT
    diff,
    count()
FROM test.floats
WHERE NOT isNaN(x)
GROUP BY diff
ORDER BY diff ASC
LIMIT 100

  * Here are the results:
  *
    Precise:
    ┌─diff─┬────count()─┐
    │    0 │ 4278190082 │
    └──────┴────────────┘
    (100% roundtrip property)

    Fast:
    ┌─diff─┬────count()─┐
    │    0 │ 3685260580 │
    │    1 │  592929502 │
    └──────┴────────────┘
    (The difference is 1 in least significant bit in 13.8% of numbers.)

    Simple:
    ┌─diff─┬────count()─┐
    │    0 │ 2169879994 │
    │    1 │ 1807178292 │
    │    2 │  269505944 │
    │    3 │   28826966 │
    │    4 │    2566488 │
    │    5 │     212878 │
    │    6 │      18276 │
    │    7 │       1214 │
    │    8 │         30 │
    └──────┴────────────┘

  * Parse random Float64 numbers:

WITH
    rand64() AS bin_x,
    reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x,
    toFloat64(toString(x)) AS y,
    reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y,
    abs(bin_x - bin_y) AS diff
SELECT
    diff,
    count()
FROM numbers(100000000)
WHERE NOT isNaN(x)
GROUP BY diff
ORDER BY diff ASC
LIMIT 100

  */


namespace DB
{

namespace ErrorCodes
{
    extern const int CANNOT_PARSE_NUMBER;
    extern const int ARGUMENT_OUT_OF_BOUND;
}


/// Returns true, iff parsed.
bool parseInfinity(ReadBuffer & buf);
bool parseNaN(ReadBuffer & buf);

void assertInfinity(ReadBuffer & buf);
void assertNaN(ReadBuffer & buf);


template <bool throw_exception>
bool assertOrParseInfinity(ReadBuffer & buf)
{
    if constexpr (throw_exception)
    {
        assertInfinity(buf);
        return true;
    }
    else
        return parseInfinity(buf);
}

template <bool throw_exception>
bool assertOrParseNaN(ReadBuffer & buf)
{
    if constexpr (throw_exception)
    {
        assertNaN(buf);
        return true;
    }
    else
        return parseNaN(buf);
}


/// Some garbage may be successfully parsed, examples: '--1' parsed as '1'.
template <typename T, typename ReturnType>
ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
{
    static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    if (buf.eof())
    {
        if constexpr (throw_exception)
            throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
        else
            return ReturnType(false);
    }

    /// We use special code to read denormals (inf, nan), because we support slightly more variants that double-conversion library does:
    /// Example: inf and Infinity.

    bool negative = false;

    while (true)
    {
        switch (*buf.position())
        {
            case '-':
            {
                negative = true;
                ++buf.position();
                continue;
            }

            case 'i': [[fallthrough]];
            case 'I':
            {
                if (assertOrParseInfinity<throw_exception>(buf))
                {
                    x = std::numeric_limits<T>::infinity();
                    if (negative)
                        x = -x;
                    return ReturnType(true);
                }
                return ReturnType(false);
            }

            case 'n': [[fallthrough]];
            case 'N':
            {
                if (assertOrParseNaN<throw_exception>(buf))
                {
                    x = std::numeric_limits<T>::quiet_NaN();
                    if (negative)
                        x = -x;
                    return ReturnType(true);
                }
                return ReturnType(false);
            }

            default:
                break;
        }
        break;
    }

    static const double_conversion::StringToDoubleConverter converter(
        double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK,
        0, 0, nullptr, nullptr);

    /// Fast path (avoid copying) if the buffer have at least MAX_LENGTH bytes.
    static constexpr int MAX_LENGTH = 316;

    if (buf.position() + MAX_LENGTH <= buf.buffer().end())
    {
        int num_processed_characters = 0;

        if constexpr (std::is_same_v<T, double>)
            x = converter.StringToDouble(buf.position(), buf.buffer().end() - buf.position(), &num_processed_characters);
        else
            x = converter.StringToFloat(buf.position(), buf.buffer().end() - buf.position(), &num_processed_characters);

        if (num_processed_characters < 0)
        {
            if constexpr (throw_exception)
                throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
            else
                return ReturnType(false);
        }

        buf.position() += num_processed_characters;

        if (negative)
            x = -x;
        return ReturnType(true);
    }
    else
    {
        /// Slow path. Copy characters that may be present in floating point number to temporary buffer.

        char tmp_buf[MAX_LENGTH];
        int num_copied_chars = 0;

        while (!buf.eof() && num_copied_chars < MAX_LENGTH)
        {
            char c = *buf.position();
            if (!(isNumericASCII(c) || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E'))
                break;

            tmp_buf[num_copied_chars] = c;
            ++buf.position();
            ++num_copied_chars;
        }

        int num_processed_characters = 0;

        if constexpr (std::is_same_v<T, double>)
            x = converter.StringToDouble(tmp_buf, num_copied_chars, &num_processed_characters);
        else
            x = converter.StringToFloat(tmp_buf, num_copied_chars, &num_processed_characters);

        if (num_processed_characters < num_copied_chars)
        {
            if constexpr (throw_exception)
                throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
            else
                return ReturnType(false);
        }

        if (negative)
            x = -x;
        return ReturnType(true);
    }
}


template <size_t N, typename T>
static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
{
    /// In optimistic case we can skip bound checking for first loop.
    if (buf.position() + N <= buf.buffer().end())
    {
        for (size_t i = 0; i < N; ++i)
        {
            if (isNumericASCII(*buf.position()))
            {
                x *= 10;
                x += *buf.position() & 0x0F;
                ++buf.position();
            }
            else
                return;
        }

        while (!buf.eof() && isNumericASCII(*buf.position()))
            ++buf.position();
    }
    else
    {
        for (size_t i = 0; i < N; ++i)
        {
            if (!buf.eof() && isNumericASCII(*buf.position()))
            {
                x *= 10;
                x += *buf.position() & 0x0F;
                ++buf.position();
            }
            else
                return;
        }

        while (!buf.eof() && isNumericASCII(*buf.position()))
            ++buf.position();
    }
}


template <typename T, typename ReturnType>
ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
{
    static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
    static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII"); //-V501

    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    bool negative = false;
    x = 0;
    UInt64 before_point = 0;
    UInt64 after_point = 0;
    int after_point_exponent = 0;
    int exponent = 0;

    if (in.eof())
    {
        if constexpr (throw_exception)
            throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
        else
            return false;
    }

    if (*in.position() == '-')
    {
        negative = true;
        ++in.position();
    }

    auto count_after_sign = in.count();

    constexpr int significant_digits = std::numeric_limits<UInt64>::digits10;
    readUIntTextUpToNSignificantDigits<significant_digits>(before_point, in);

    int read_digits = in.count() - count_after_sign;

    if (unlikely(read_digits > significant_digits))
    {
        int before_point_additional_exponent = read_digits - significant_digits;
        x = shift10(before_point, before_point_additional_exponent);
    }
    else
    {
        x = before_point;

        /// Shortcut for the common case when there is an integer that fit in Int64.
        if (read_digits && (in.eof() || *in.position() < '.'))
        {
            if (negative)
                x = -x;
            return ReturnType(true);
        }
    }

    if (checkChar('.', in))
    {
        auto after_point_count = in.count();

        while (!in.eof() && *in.position() == '0')
            ++in.position();

        auto after_leading_zeros_count = in.count();
        auto after_point_num_leading_zeros = after_leading_zeros_count - after_point_count;

        readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in);
        read_digits = in.count() - after_leading_zeros_count;
        after_point_exponent = (read_digits > significant_digits ? -significant_digits : -read_digits) - after_point_num_leading_zeros;
    }

    if (checkChar('e', in) || checkChar('E', in))
    {
        if (in.eof())
        {
            if constexpr (throw_exception)
                throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
            else
                return false;
        }

        bool exponent_negative = false;
        if (*in.position() == '-')
        {
            exponent_negative = true;
            ++in.position();
        }
        else if (*in.position() == '+')
        {
            ++in.position();
        }

        readUIntTextUpToNSignificantDigits<4>(exponent, in);
        if (exponent_negative)
            exponent = -exponent;
    }

    if (after_point)
        x += shift10(after_point, after_point_exponent);

    if (exponent)
        x = shift10(x, exponent);

    if (negative)
        x = -x;

    auto num_characters_without_sign = in.count() - count_after_sign;

    /// Denormals. At most one character is read before denormal and it is '-'.
    if (num_characters_without_sign == 0)
    {
        if (in.eof())
        {
            if constexpr (throw_exception)
                throw Exception("Cannot read floating point value", ErrorCodes::CANNOT_PARSE_NUMBER);
            else
                return false;
        }

        if (*in.position() == 'i' || *in.position() == 'I')
        {
            if (assertOrParseInfinity<throw_exception>(in))
            {
                x = std::numeric_limits<T>::infinity();
                if (negative)
                    x = -x;
                return ReturnType(true);
            }
            return ReturnType(false);
        }
        else if (*in.position() == 'n' || *in.position() == 'N')
        {
            if (assertOrParseNaN<throw_exception>(in))
            {
                x = std::numeric_limits<T>::quiet_NaN();
                if (negative)
                    x = -x;
                return ReturnType(true);
            }
            return ReturnType(false);
        }
    }

    return ReturnType(true);
}


template <typename T, typename ReturnType>
ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
{
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    bool negative = false;
    x = 0;
    bool after_point = false;
    double power_of_ten = 1;

    if (buf.eof())
        throwReadAfterEOF();

    while (!buf.eof())
    {
        switch (*buf.position())
        {
            case '+':
                break;
            case '-':
                negative = true;
                break;
            case '.':
                after_point = true;
                break;
            case '0': [[fallthrough]];
            case '1': [[fallthrough]];
            case '2': [[fallthrough]];
            case '3': [[fallthrough]];
            case '4': [[fallthrough]];
            case '5': [[fallthrough]];
            case '6': [[fallthrough]];
            case '7': [[fallthrough]];
            case '8': [[fallthrough]];
            case '9':
                if (after_point)
                {
                    power_of_ten /= 10;
                    x += (*buf.position() - '0') * power_of_ten;
                }
                else
                {
                    x *= 10;
                    x += *buf.position() - '0';
                }
                break;
            case 'e': [[fallthrough]];
            case 'E':
            {
                ++buf.position();
                Int32 exponent = 0;
                readIntText(exponent, buf);
                x = shift10(x, exponent);
                if (negative)
                    x = -x;
                return ReturnType(true);
            }

            case 'i': [[fallthrough]];
            case 'I':
            {
                if (assertOrParseInfinity<throw_exception>(buf))
                {
                    x = std::numeric_limits<T>::infinity();
                    if (negative)
                        x = -x;
                    return ReturnType(true);
                }
                return ReturnType(false);
            }

            case 'n': [[fallthrough]];
            case 'N':
            {
                if (assertOrParseNaN<throw_exception>(buf))
                {
                    x = std::numeric_limits<T>::quiet_NaN();
                    if (negative)
                        x = -x;
                    return ReturnType(true);
                }
                return ReturnType(false);
            }

            default:
            {
                if (negative)
                    x = -x;
                return ReturnType(true);
            }
        }
        ++buf.position();
    }

    if (negative)
        x = -x;

    return ReturnType(true);
}


template <bool _throw_on_error, typename T>
inline bool readDigits(ReadBuffer & buf, T & x, unsigned int & digits, int & exponent, bool digits_only = false)
{
    x = 0;
    exponent = 0;
    unsigned int max_digits = digits;
    digits = 0;
    unsigned int places = 0;
    typename T::NativeType sign = 1;
    bool leading_zeroes = true;
    bool after_point = false;

    if (buf.eof())
    {
        if constexpr (_throw_on_error)
            throwReadAfterEOF();
        return false;
    }

    if (!buf.eof())
    {
        switch (*buf.position())
        {
            case '-':
                sign = -1;
                [[fallthrough]];
            case '+':
                ++buf.position();
                break;
        }
    }

    bool stop = false;
    while (!buf.eof() && !stop)
    {
        const char & byte = *buf.position();
        switch (byte)
        {
            case '.':
                after_point = true;
                leading_zeroes = false;
                break;
            case '0':
            {
                if (leading_zeroes)
                    break;

                if (after_point)
                {
                    ++places; /// Count trailing zeroes. They would be used only if there's some other digit after them.
                    break;
                }
                [[fallthrough]];
            }
            case '1': [[fallthrough]];
            case '2': [[fallthrough]];
            case '3': [[fallthrough]];
            case '4': [[fallthrough]];
            case '5': [[fallthrough]];
            case '6': [[fallthrough]];
            case '7': [[fallthrough]];
            case '8': [[fallthrough]];
            case '9':
            {
                leading_zeroes = false;

                ++places; // num zeroes before + current digit
                if (digits + places > max_digits)
                {
                    if constexpr (_throw_on_error)
                        throw Exception("Too many digits (" + std::to_string(digits + places) + " > " + std::to_string(max_digits)
                            + ") in decimal value", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
                    return false;
                }

                digits += places;
                if (after_point)
                    exponent -= places;

                // TODO: accurate shift10 for big integers
                for (; places; --places)
                    x *= 10;
                x += (byte - '0');
                break;
            }
            case 'e': [[fallthrough]];
            case 'E':
            {
                ++buf.position();
                Int32 addition_exp = 0;
                readIntText(addition_exp, buf);
                exponent += addition_exp;
                stop = true;
                continue;
            }

            default:
                if (digits_only)
                {
                    if constexpr (_throw_on_error)
                        throw Exception("Unexpected symbol while reading decimal", ErrorCodes::CANNOT_PARSE_NUMBER);
                    return false;
                }
                stop = true;
                continue;
        }
        ++buf.position();
    }

    x *= sign;
    return true;
}

template <typename T>
inline void readDecimalText(ReadBuffer & buf, T & x, unsigned int precision, unsigned int & scale, bool digits_only = false)
{
    unsigned int digits = precision;
    int exponent;
    readDigits<true>(buf, x, digits, exponent, digits_only);

    if (static_cast<int>(digits) + exponent > static_cast<int>(precision - scale))
        throw Exception("Decimal value is too big", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
    if (static_cast<int>(scale) + exponent < 0)
        throw Exception("Decimal value is too small", ErrorCodes::ARGUMENT_OUT_OF_BOUND);

    scale += exponent;
}

template <typename T>
inline bool tryReadDecimalText(ReadBuffer & buf, T & x, unsigned int precision, unsigned int & scale)
{
    unsigned int digits = precision;
    int exponent;

    if (!readDigits<false>(buf, x, digits, exponent, true) ||
        static_cast<int>(digits) + exponent > static_cast<int>(precision - scale) ||
        static_cast<int>(scale) + exponent < 0)
        return false;

    scale += exponent;
    return true;
}

template <typename T>
inline void readCSVDecimalText(ReadBuffer & buf, T & x, unsigned int precision, unsigned int & scale)
{
    if (buf.eof())
        throwReadAfterEOF();

    char maybe_quote = *buf.position();

    if (maybe_quote == '\'' || maybe_quote == '\"')
        ++buf.position();

    readDecimalText(buf, x, precision, scale, false);

    if (maybe_quote == '\'' || maybe_quote == '\"')
        assertChar(maybe_quote, buf);
}

template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }

template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }

template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }


/// Implementation that is selected as default.

template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); }
template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }


}