Merge branch 'master' into revert-revert-drop-cache

2024-11-25 09:02:00 +00:00 · 2023-08-02 10:54:49 +02:00 · 2023-08-02 10:54:49 +02:00 · 7d609ca21d
commit 7d609ca21d
parent eabca22909 61ae0792b6
15 changed files with 155 additions and 20 deletions
--- a/docker/test/integration/runner/Dockerfile
+++ b/docker/test/integration/runner/Dockerfile
@ -95,6 +95,7 @@ RUN python3 -m pip install --no-cache-dir \
    pytest-timeout \
    pytest-xdist \
    pytz \
+    pyyaml==5.3.1 \
    redis \
    requests-kerberos \
    tzlocal==2.1 \
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -4578,3 +4578,28 @@ Type: Int64

 Default: 0

+## precise_float_parsing {#precise_float_parsing}
+
+Switches [Float32/Float64](../../sql-reference/data-types/float.md) parsing algorithms:
+* If the value is `1`, then precise method is used. It is slower than fast method, but it always returns a number that is the closest machine representable number to the input.
+* Otherwise, fast method is used (default). It usually returns the same value as precise, but in rare cases result may differ by one or two least significant digits.
+
+Possible values: `0`, `1`.
+
+Default value: `0`.
+
+Example:
+
+```sql
+SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 0;
+
+┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐
+│  1.7090999999999998 │       15008753.000000002 │
+└─────────────────────┴──────────────────────────┘
+
+SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 1;
+
+┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐
+│              1.7091 │                 15008753 │
+└─────────────────────┴──────────────────────────┘
+```
--- a/docs/ru/operations/settings/settings.md
+++ b/docs/ru/operations/settings/settings.md
@ -4213,3 +4213,29 @@ SELECT *, timezone() FROM test_tz WHERE d = '2000-01-01 00:00:00' SETTINGS sessi
 - Запрос: `SELECT * FROM file('sample.csv')`

 Если чтение и обработка `sample.csv` прошли успешно, файл будет переименован в `processed_sample_1683473210851438.csv`.
+
+## precise_float_parsing {#precise_float_parsing}
+
+Позволяет выбрать алгоритм, используемый при парсинге [Float32/Float64](../../sql-reference/data-types/float.md):
+* Если установлено значение `1`, то используется точный метод. Он более медленный, но всегда возвращает число, наиболее близкое к входному значению.
+* В противном случае используется быстрый метод (поведение по умолчанию). Обычно результат его работы совпадает с результатом, полученным точным методом, однако в редких случаях он может отличаться на 1 или 2 наименее значимых цифры.
+
+Возможные значения: `0`, `1`.
+
+Значение по умолчанию: `0`.
+
+Пример:
+
+```sql
+SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 0;
+
+┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐
+│  1.7090999999999998 │       15008753.000000002 │
+└─────────────────────┴──────────────────────────┘
+
+SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 1;
+
+┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐
+│              1.7091 │                 15008753 │
+└─────────────────────┴──────────────────────────┘
+```
--- a/src/Common/OptimizedRegularExpression.cpp
+++ b/src/Common/OptimizedRegularExpression.cpp
@ -45,6 +45,25 @@ size_t shortest_literal_length(const Literals & literals)
    return shortest;
 }

+const char * skipNameCapturingGroup(const char * pos, size_t offset, const char * end)
+{
+    const char special = *(pos + offset) == '<' ? '>' : '\'';
+    offset ++;
+    while (pos + offset < end)
+    {
+        const char cur = *(pos + offset);
+        if (cur == special)
+        {
+            return pos + offset;
+        }
+        if (('0' <= cur && cur <= '9') || ('a' <= cur && cur <= 'z') || ('A' <= cur && cur <= 'Z'))
+            offset ++;
+        else
+            return pos;
+    }
+    return pos;
+}
+
 const char * analyzeImpl(
    std::string_view regexp,
    const char * pos,
@ -247,10 +266,15 @@ const char * analyzeImpl(
                                break;
                        }
                    }
+                    /// (?:regex) means non-capturing parentheses group
                    if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
                    {
                        pos += 2;
                    }
+                    if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
+                    {
+                        pos = skipNameCapturingGroup(pos, pos[2] == 'P' ? 3: 2, end);
+                    }
                    Literal group_required_substr;
                    bool group_is_trival = true;
                    Literals group_alters;
--- a/src/Common/tests/gtest_optimize_re.cpp
+++ b/src/Common/tests/gtest_optimize_re.cpp
@ -47,4 +47,8 @@ TEST(OptimizeRE, analyze)
    test_f("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"});
    test_f("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"});
    test_f(R"(\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z)", "/k8s1");
+    test_f("[a-zA-Z]+(?P<num>\\d+)", "");
+    test_f("[a-zA-Z]+(?<num>\\d+)", "");
+    test_f("[a-zA-Z]+(?'num'\\d+)", "");
+    test_f("[a-zA-Z]+(?x<num>\\d+)", "x<num>");
 }
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -1031,7 +1031,8 @@ class IColumn;
    M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
    \
    M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \
-    M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \
+    M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0)                                                         \
+    M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \

 // End of FORMAT_FACTORY_SETTINGS
 // Please add settings non-related to formats into the COMMON_SETTINGS above.
--- a/src/Functions/FunctionsConversion.h
+++ b/src/Functions/FunctionsConversion.h
@ -1040,13 +1040,21 @@ inline void convertFromTime<DataTypeDateTime>(DataTypeDateTime::FieldType & x, t
 /** Conversion of strings to numbers, dates, datetimes: through parsing.
  */
 template <typename DataType>
-void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *)
+void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool precise_float_parsing)
 {
-    readText(x, rb);
+    if constexpr (std::is_floating_point_v<typename DataType::FieldType>)
+    {
+        if (precise_float_parsing)
+            readFloatTextPrecise(x, rb);
+        else
+            readFloatTextFast(x, rb);
+    }
+    else
+        readText(x, rb);
 }

 template <>
-inline void parseImpl<DataTypeDate>(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone)
+inline void parseImpl<DataTypeDate>(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool)
 {
    DayNum tmp(0);
    readDateText(tmp, rb, *time_zone);
@ -1054,7 +1062,7 @@ inline void parseImpl<DataTypeDate>(DataTypeDate::FieldType & x, ReadBuffer & rb
 }

 template <>
-inline void parseImpl<DataTypeDate32>(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone)
+inline void parseImpl<DataTypeDate32>(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool)
 {
    ExtendedDayNum tmp(0);
    readDateText(tmp, rb, *time_zone);
@ -1064,7 +1072,7 @@ inline void parseImpl<DataTypeDate32>(DataTypeDate32::FieldType & x, ReadBuffer

 // NOTE: no need of extra overload of DateTime64, since readDateTimeText64 has different signature and that case is explicitly handled in the calling code.
 template <>
-inline void parseImpl<DataTypeDateTime>(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone)
+inline void parseImpl<DataTypeDateTime>(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool)
 {
    time_t time = 0;
    readDateTimeText(time, rb, *time_zone);
@ -1072,7 +1080,7 @@ inline void parseImpl<DataTypeDateTime>(DataTypeDateTime::FieldType & x, ReadBuf
 }

 template <>
-inline void parseImpl<DataTypeUUID>(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *)
+inline void parseImpl<DataTypeUUID>(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool)
 {
    UUID tmp;
    readUUIDText(tmp, rb);
@ -1080,7 +1088,7 @@ inline void parseImpl<DataTypeUUID>(DataTypeUUID::FieldType & x, ReadBuffer & rb
 }

 template <>
-inline void parseImpl<DataTypeIPv4>(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *)
+inline void parseImpl<DataTypeIPv4>(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool)
 {
    IPv4 tmp;
    readIPv4Text(tmp, rb);
@ -1088,7 +1096,7 @@ inline void parseImpl<DataTypeIPv4>(DataTypeIPv4::FieldType & x, ReadBuffer & rb
 }

 template <>
-inline void parseImpl<DataTypeIPv6>(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *)
+inline void parseImpl<DataTypeIPv6>(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool)
 {
    IPv6 tmp;
    readIPv6Text(tmp, rb);
@ -1096,16 +1104,21 @@ inline void parseImpl<DataTypeIPv6>(DataTypeIPv6::FieldType & x, ReadBuffer & rb
 }

 template <typename DataType>
-bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *)
+bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool precise_float_parsing)
 {
    if constexpr (std::is_floating_point_v<typename DataType::FieldType>)
-        return tryReadFloatText(x, rb);
+    {
+        if (precise_float_parsing)
+            return tryReadFloatTextPrecise(x, rb);
+        else
+            return tryReadFloatTextFast(x, rb);
+    }
    else /*if constexpr (is_integer_v<typename DataType::FieldType>)*/
        return tryReadIntText(x, rb);
 }

 template <>
-inline bool tryParseImpl<DataTypeDate>(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone)
+inline bool tryParseImpl<DataTypeDate>(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool)
 {
    DayNum tmp(0);
    if (!tryReadDateText(tmp, rb, *time_zone))
@ -1115,7 +1128,7 @@ inline bool tryParseImpl<DataTypeDate>(DataTypeDate::FieldType & x, ReadBuffer &
 }

 template <>
-inline bool tryParseImpl<DataTypeDate32>(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone)
+inline bool tryParseImpl<DataTypeDate32>(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool)
 {
    ExtendedDayNum tmp(0);
    if (!tryReadDateText(tmp, rb, *time_zone))
@ -1125,7 +1138,7 @@ inline bool tryParseImpl<DataTypeDate32>(DataTypeDate32::FieldType & x, ReadBuff
 }

 template <>
-inline bool tryParseImpl<DataTypeDateTime>(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone)
+inline bool tryParseImpl<DataTypeDateTime>(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool)
 {
    time_t tmp = 0;
    if (!tryReadDateTimeText(tmp, rb, *time_zone))
@ -1135,7 +1148,7 @@ inline bool tryParseImpl<DataTypeDateTime>(DataTypeDateTime::FieldType & x, Read
 }

 template <>
-inline bool tryParseImpl<DataTypeUUID>(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *)
+inline bool tryParseImpl<DataTypeUUID>(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool)
 {
    UUID tmp;
    if (!tryReadUUIDText(tmp, rb))
@ -1146,7 +1159,7 @@ inline bool tryParseImpl<DataTypeUUID>(DataTypeUUID::FieldType & x, ReadBuffer &
 }

 template <>
-inline bool tryParseImpl<DataTypeIPv4>(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *)
+inline bool tryParseImpl<DataTypeIPv4>(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool)
 {
    IPv4 tmp;
    if (!tryReadIPv4Text(tmp, rb))
@ -1157,7 +1170,7 @@ inline bool tryParseImpl<DataTypeIPv4>(DataTypeIPv4::FieldType & x, ReadBuffer &
 }

 template <>
-inline bool tryParseImpl<DataTypeIPv6>(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *)
+inline bool tryParseImpl<DataTypeIPv6>(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool)
 {
    IPv6 tmp;
    if (!tryReadIPv6Text(tmp, rb))
@ -1336,6 +1349,16 @@ struct ConvertThroughParsing

        size_t current_offset = 0;

+        bool precise_float_parsing = false;
+
+        if (DB::CurrentThread::isInitialized())
+        {
+            const DB::ContextPtr query_context = DB::CurrentThread::get().getQueryContext();
+
+            if (query_context)
+                precise_float_parsing = query_context->getSettingsRef().precise_float_parsing;
+        }
+
        for (size_t i = 0; i < size; ++i)
        {
            size_t next_offset = std::is_same_v<FromDataType, DataTypeString> ? (*offsets)[i] : (current_offset + fixed_string_size);
@ -1402,7 +1425,7 @@ struct ConvertThroughParsing
                                }
                            }

-                            parseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
+                            parseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone, precise_float_parsing);
                        } while (false);
                    }
                }
@ -1472,7 +1495,7 @@ struct ConvertThroughParsing
                                }
                            }

-                            parsed = tryParseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
+                            parsed = tryParseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone, precise_float_parsing);
                        } while (false);
                    }
                }
--- a/src/IO/ReadHelpers.h
+++ b/src/IO/ReadHelpers.h
@ -529,6 +529,11 @@ void tryReadIntTextUnsafe(T & x, ReadBuffer & buf)
 template <typename T> void readFloatText(T & x, ReadBuffer & in);
 template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in);

+template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in);
+template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in);
+template <typename T> void readFloatTextFast(T & x, ReadBuffer & in);
+template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in);
+

 /// simple: all until '\n' or '\t'
 void readString(String & s, ReadBuffer & buf);
--- a/tests/ci/docker_test.py
+++ b/tests/ci/docker_test.py
@ -18,7 +18,7 @@ import docker_server as ds

 class TestDockerImageCheck(unittest.TestCase):
    docker_images_path = os.path.join(
-        os.path.dirname(__file__), "tests/docker_images.json"
+        os.path.dirname(__file__), "tests/docker_images_for_tests.json"
    )

    def test_get_changed_docker_images(self):
--- a/tests/ci/tests/docker_images_for_tests.json
+++ b/tests/ci/tests/docker_images_for_tests.json
--- a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py
+++ b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py
@ -43,6 +43,7 @@ def get_fake_zk(node):
    return ku.get_fake_zk(cluster, node)


+@pytest.mark.skip(reason="test is flaky because changes are not properly waited for")
 def test_reconfig_replace_leader_in_one_command(started_cluster):
    """
    Remove leader from a cluster of 3 and add a new node to this cluster in a single command
--- a/tests/queries/0_stateless/02751_match_constant_needle.reference
+++ b/tests/queries/0_stateless/02751_match_constant_needle.reference
@ -1 +1,2 @@
 1
+1
--- a/tests/queries/0_stateless/02751_match_constant_needle.sql
+++ b/tests/queries/0_stateless/02751_match_constant_needle.sql
@ -1 +1,2 @@
 select match('default/k8s1', '\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z');
+select match('abc123', '[a-zA-Z]+(?P<num>\\d+)');
--- a/tests/queries/0_stateless/02813_float_parsing.reference
+++ b/tests/queries/0_stateless/02813_float_parsing.reference
@ -0,0 +1,2 @@
+1.7090999999999998	15008753.000000002	6.000000000000001e-9	6.000000000000002e-9	1.7091	15008752	5.9999996e-9	5.9999996e-9
+1.7091	15008753	6e-9	6.000000000000001e-9	1.7091	15008753	6e-9	6e-9
--- a/tests/queries/0_stateless/02813_float_parsing.sql
+++ b/tests/queries/0_stateless/02813_float_parsing.sql
@ -0,0 +1,21 @@
+SELECT
+    toFloat64('1.7091'),
+    toFloat64('1.5008753E7'),
+    toFloat64('6e-09'),
+    toFloat64('6.000000000000001e-9'),
+    toFloat32('1.7091'),
+    toFloat32('1.5008753E7'),
+    toFloat32('6e-09'),
+    toFloat32('6.000000000000001e-9')
+SETTINGS precise_float_parsing = 0;
+
+SELECT
+    toFloat64('1.7091'),
+    toFloat64('1.5008753E7'),
+    toFloat64('6e-09'),
+    toFloat64('6.000000000000001e-9'),
+    toFloat32('1.7091'),
+    toFloat32('1.5008753E7'),
+    toFloat32('6e-09'),
+    toFloat32('6.000000000000001e-9')
+SETTINGS precise_float_parsing = 1;