Merge pull request #48294 from ClickHouse/update-arrow-2

Try to update arrow library to release 11.0.0
2024-11-24 00:22:29 +00:00 · 2023-04-05 04:43:51 +02:00 · 2023-04-05 04:43:51 +02:00 · fb3af065f4
commit fb3af065f4
parent 25be18c83d 0fc76b00be
10 changed files with 162 additions and 31 deletions
--- a/base/glibc-compatibility/musl/expf.c
+++ b/base/glibc-compatibility/musl/expf.c
@ -0,0 +1,81 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_expf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#include "libm.h"
+
+static const float
+    half[2] = {0.5,-0.5},  /* rounding offset added before truncation, indexed by the sign bit of x */
+    ln2hi   = 6.9314575195e-1f,  /* 0x3f317200, high part of ln2 */
+    ln2lo   = 1.4286067653e-6f,  /* 0x35bfbe8e, low part of ln2: ln2hi + ln2lo ~= ln2 */
+    invln2  = 1.4426950216e+0f,  /* 0x3fb8aa3b, 1/ln2 */
+    /*
+     * Domain [-0.34568, 0.34568], range ~[-4.278e-9, 4.447e-9]:
+     * |x*(exp(x)+1)/(exp(x)-1) - p(x)| < 2**-27.74
+     */
+    P1 =  1.6666625440e-1f, /*  0xaaaa8f.0p-26 */
+    P2 = -2.7667332906e-3f; /* -0xb55215.0p-32 */
+
+float expf(float x)  /* returns e**x in single precision; a NaN argument of either sign yields NaN */
+{
+    float_t hi, lo, c, xx, y;
+    int k, sign;
+    uint32_t hx;
+
+    GET_FLOAT_WORD(hx, x);
+    sign = hx >> 31;   /* sign bit of x */
+    hx &= 0x7fffffff;  /* bit pattern of |x| */
+
+    /* special cases: NaN, overflow and underflow */
+    if (hx >= 0x42aeac50) {  /* if |x| >= 87.33655f or NaN */
+        if (hx > 0x7f800000 || (hx >= 0x42b17218 && !sign)) {  /* NaN, or x >= 88.722839f */
+            /* NaN propagates through the multiply (a negative NaN must not reach the underflow path); large x overflows */
+            x *= 0x1p127f;
+            return x;
+        }
+        if (sign) {
+            /* underflow */
+            FORCE_EVAL(-0x1p-149f/x);
+            if (hx >= 0x42cff1b5)  /* x <= -103.972084f */
+                return 0;
+        }
+    }
+
+    /* argument reduction: x = k*ln2 + (hi - lo) */
+    if (hx > 0x3eb17218) {  /* if |x| > 0.5 ln2 */
+        if (hx > 0x3f851592)  /* if |x| > 1.5 ln2 */
+            k = invln2*x + half[sign];  /* x/ln2 rounded to nearest: add +-0.5, then truncate */
+        else
+            k = 1 - sign - sign;  /* +1 for positive x, -1 for negative x */
+        hi = x - k*ln2hi;  /* k*ln2hi is exact here */
+        lo = k*ln2lo;
+        x = hi - lo;
+    } else if (hx > 0x39000000) {  /* |x| > 2**-14 */
+        k = 0;
+        hi = x;
+        lo = 0;
+    } else {
+        /* raise inexact */
+        FORCE_EVAL(0x1p127f + x);
+        return 1 + x;
+    }
+
+    /* x is now in primary range */
+    xx = x*x;
+    c = x - xx*(P1+xx*P2);
+    y = 1 + (x*c/(2-c) - lo + hi);
+    if (k == 0)
+        return y;
+    return scalbnf(y, k);  /* y * 2**k */
+}
--- a/base/glibc-compatibility/musl/scalbnf.c
+++ b/base/glibc-compatibility/musl/scalbnf.c
@ -0,0 +1,31 @@
+#include <math.h>
+#include <stdint.h>
+
+float scalbnf(float x, int n)  /* returns x * 2**n, rounding at most once */
+{
+    union {float f; uint32_t i;} u;
+    float_t y = x;
+
+    if (n > 127) {  /* 2**n exceeds the float exponent range: scale in steps of 2**127 */
+        y *= 0x1p127f;
+        n -= 127;
+        if (n > 127) {
+            y *= 0x1p127f;
+            n -= 127;
+            if (n > 127)
+                n = 127;  /* clamp: x * 2**n overflows for any nonzero x */
+        }
+    } else if (n < -126) {  /* 2**n is below the smallest normal float: scale in steps of 2**-102 */
+        y *= 0x1p-126f * 0x1p24f;  /* extra 2**24 avoids double rounding when the result is subnormal */
+        n += 126 - 24;
+        if (n < -126) {
+            y *= 0x1p-126f * 0x1p24f;
+            n += 126 - 24;
+            if (n < -126)
+                n = -126;  /* clamp: x * 2**n rounds to zero for any finite x */
+        }
+    }
+    u.i = (uint32_t)(0x7f+n)<<23;  /* bit pattern of 2**n, valid for -126 <= n <= 127 */
+    x = y * u.f;  /* final scaling by 2**n */
+    return x;
+}
--- a/contrib/arrow
+++ b/contrib/arrow
@ -1 +1 @@
-Subproject commit d03245f801f798c63ee9a7d2b8914a9e5c5cd666
+Subproject commit 1f1b3d35fb6eb73e6492d3afd8a85cde848d174f
--- a/contrib/arrow-cmake/CMakeLists.txt
+++ b/contrib/arrow-cmake/CMakeLists.txt
@ -202,6 +202,7 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/builder.cc"
        "${LIBRARY_DIR}/buffer.cc"
        "${LIBRARY_DIR}/chunked_array.cc"
+        "${LIBRARY_DIR}/chunk_resolver.cc"
        "${LIBRARY_DIR}/compare.cc"
        "${LIBRARY_DIR}/config.cc"
        "${LIBRARY_DIR}/datum.cc"
@ -268,6 +269,10 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/util/uri.cc"
        "${LIBRARY_DIR}/util/utf8.cc"
        "${LIBRARY_DIR}/util/value_parsing.cc"
+        "${LIBRARY_DIR}/util/byte_size.cc"
+        "${LIBRARY_DIR}/util/debug.cc"
+        "${LIBRARY_DIR}/util/tracing.cc"
+        "${LIBRARY_DIR}/util/atfork_internal.cc"
        "${LIBRARY_DIR}/vendored/base64.cpp"
        "${LIBRARY_DIR}/vendored/datetime/tz.cpp"

@ -301,9 +306,11 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/compute/exec/source_node.cc"
        "${LIBRARY_DIR}/compute/exec/sink_node.cc"
        "${LIBRARY_DIR}/compute/exec/order_by_impl.cc"
+        "${LIBRARY_DIR}/compute/exec/partition_util.cc"
        "${LIBRARY_DIR}/compute/function.cc"
        "${LIBRARY_DIR}/compute/function_internal.cc"
        "${LIBRARY_DIR}/compute/kernel.cc"
+        "${LIBRARY_DIR}/compute/light_array.cc"
        "${LIBRARY_DIR}/compute/registry.cc"
        "${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc"
        "${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc"
@ -317,21 +324,28 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_cast_dictionary.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc"
+        "${LIBRARY_DIR}/compute/kernels/scalar_cast_extension.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_compare.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_nested.cc"
+        "${LIBRARY_DIR}/compute/kernels/scalar_random.cc"
+        "${LIBRARY_DIR}/compute/kernels/scalar_round.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc"
-        "${LIBRARY_DIR}/compute/kernels/scalar_string.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_temporal_binary.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_temporal_unary.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_validity.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_if_else.cc"
+        "${LIBRARY_DIR}/compute/kernels/scalar_string_ascii.cc"
+        "${LIBRARY_DIR}/compute/kernels/scalar_string_utf8.cc"
        "${LIBRARY_DIR}/compute/kernels/util_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_array_sort.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_hash.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_rank.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_select_k.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_nested.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_replace.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_selection.cc"
@ -340,13 +354,15 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/compute/exec/union_node.cc"
        "${LIBRARY_DIR}/compute/exec/key_hash.cc"
        "${LIBRARY_DIR}/compute/exec/key_map.cc"
-        "${LIBRARY_DIR}/compute/exec/key_compare.cc"
-        "${LIBRARY_DIR}/compute/exec/key_encode.cc"
        "${LIBRARY_DIR}/compute/exec/util.cc"
        "${LIBRARY_DIR}/compute/exec/hash_join_dict.cc"
        "${LIBRARY_DIR}/compute/exec/hash_join.cc"
        "${LIBRARY_DIR}/compute/exec/hash_join_node.cc"
        "${LIBRARY_DIR}/compute/exec/task_util.cc"
+        "${LIBRARY_DIR}/compute/row/encode_internal.cc"
+        "${LIBRARY_DIR}/compute/row/grouper.cc"
+        "${LIBRARY_DIR}/compute/row/compare_internal.cc"
+        "${LIBRARY_DIR}/compute/row/row_internal.cc"

        "${LIBRARY_DIR}/ipc/dictionary.cc"
        "${LIBRARY_DIR}/ipc/feather.cc"
@ -357,7 +373,8 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/ipc/writer.cc"

        "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc"
-        "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter_util.cc"
+        "${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc"
+        "${ARROW_SRC_DIR}/arrow/adapters/orc/options.cc"
        )

 add_definitions(-DARROW_WITH_LZ4)
--- a/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h
+++ b/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h
@ -21,9 +21,7 @@ namespace ErrorCodes
 {
    extern const int LOGICAL_ERROR;
 }
-/// For ORC format, index_nested_type = true, a nested type takes one index count. And the
-/// the start index for ORC format should be 1, since index 0 indicates to select all columns.
-template<bool index_nested_type>
+
 class ArrowFieldIndexUtil
 {
 public:
@ -46,9 +44,7 @@ public:
        calculateFieldIndices(const arrow::Schema & schema)
    {
        std::unordered_map<std::string, std::pair<int, int>> result;
-        // For format like ORC, index = 0 indicates to select all columns, so we skip 0 and start
-        // from 1.
-        int index_start = index_nested_type;
+        int index_start = 0;
        for (int i = 0; i < schema.num_fields(); ++i)
        {
            const auto & field = schema.field(i);
@ -94,17 +90,16 @@ public:
    }

    /// Count the number of indices for types.
-    /// For orc format, index_nested_type is true, a complex type takes one index.
    size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
    {
        if (type->id() == arrow::Type::LIST)
        {
-            return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + index_nested_type;
+            return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type());
        }

        if (type->id() == arrow::Type::STRUCT)
        {
-            int indices = index_nested_type;
+            int indices = 0;
            auto * struct_type = static_cast<arrow::StructType *>(type.get());
            for (int i = 0; i != struct_type->num_fields(); ++i)
                indices += countIndicesForType(struct_type->field(i)->type());
@ -114,7 +109,7 @@ public:
        if (type->id() == arrow::Type::MAP)
        {
            auto * map_type = static_cast<arrow::MapType *>(type.get());
-            return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()) + index_nested_type;
+            return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()) ;
        }

        return 1;
@ -144,8 +139,6 @@ private:
        index_info.first = current_start_index;
        if (field_type->id() == arrow::Type::STRUCT)
        {
-            current_start_index += index_nested_type;
-
            auto * struct_type = static_cast<arrow::StructType *>(field_type.get());
            for (int i = 0, n = struct_type->num_fields(); i < n; ++i)
            {
@ -161,7 +154,6 @@ private:
            const auto * list_type = static_cast<arrow::ListType *>(field_type.get());
            const auto value_field = list_type->value_field();
            auto index_snapshot = current_start_index;
-            current_start_index += index_nested_type;
            calculateFieldIndices(*value_field, field_name, current_start_index, result, name_prefix);
            // The nested struct field has the same name as this list field.
            // rewrite it back to the original value.
--- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
@ -129,10 +129,17 @@ void ORCBlockInputFormat::prepareReader()
        format_settings.null_as_default,
        format_settings.orc.case_insensitive_column_matching);

-    ArrowFieldIndexUtil<true> field_util(
-        format_settings.orc.case_insensitive_column_matching,
-        format_settings.orc.allow_missing_columns);
-    include_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);
+    const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
+    std::unordered_set<String> nested_table_names;
+    if (format_settings.orc.import_nested)
+        nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case);
+
+    for (int i = 0; i < schema->num_fields(); ++i)
+    {
+        const auto & name = schema->field(i)->name();
+        if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name))
+            include_indices.push_back(i);
+    }
 }

 ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
--- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp
@ -132,7 +132,7 @@ void ParquetBlockInputFormat::prepareReader()
        format_settings.null_as_default,
        format_settings.parquet.case_insensitive_column_matching);

-    ArrowFieldIndexUtil<false> field_util(
+    ArrowFieldIndexUtil field_util(
        format_settings.parquet.case_insensitive_column_matching,
        format_settings.parquet.allow_missing_columns);
    column_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);
--- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp
@ -95,14 +95,14 @@ void ParquetBlockOutputFormat::consume(Chunk chunk)
        builder.version(getParquetVersion(format_settings));
        builder.compression(getParquetCompression(format_settings.parquet.output_compression_method));
        auto props = builder.build();
-        auto status = parquet::arrow::FileWriter::Open(
+        auto result = parquet::arrow::FileWriter::Open(
            *arrow_table->schema(),
            arrow::default_memory_pool(),
            sink,
-            props, /*parquet::default_writer_properties(),*/
-            &file_writer);
-        if (!status.ok())
-            throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", status.ToString());
+            props);
+        if (!result.ok())
+            throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", result.status().ToString());
+        file_writer = std::move(result.ValueOrDie());
    }

    // TODO: calculate row_group_size depending on a number of rows and table size
--- a/tests/queries/0_stateless/00900_long_parquet_load.reference
+++ b/tests/queries/0_stateless/00900_long_parquet_load.reference
@ -92,8 +92,11 @@ idx10	['This','is','a','test']
 123	1
 456	2
 === Try load data from datapage_v2.snappy.parquet
-Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). (CANNOT_READ_ALL_DATA)
-
+abc	1	2	1	[1,2,3]
+abc	2	3	1	[]
+abc	3	4	1	[]
+\N	4	5	0	[1,2,3]
+abc	5	2	1	[1,2]
 === Try load data from datatype-date32.parquet
 1925-01-01
 1949-10-01
--- a/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns
+++ b/tests/queries/0_stateless/data_parquet/datapage_v2.snappy.parquet.columns
@ -1 +1 @@
-`a` Nullable(String), `b` Array(Nullable(Int32)), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32))
+`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32))