Merge pull request #48294 from ClickHouse/update-arrow-2

Try to update arrow library to release 11.0.0
This commit is contained in:
robot-clickhouse-ci-1 2023-04-05 04:43:51 +02:00 committed by GitHub
commit fb3af065f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 162 additions and 31 deletions

View File

@ -0,0 +1,81 @@
/* origin: FreeBSD /usr/src/lib/msun/src/e_expf.c */
/*
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
*/
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#include "libm.h"
/* Rounding bias added before the float->int conversion of x/ln2; indexed by the sign bit of x. */
static const float half[2] = {0.5f, -0.5f};

/* ln2 split into two parts; k*ln2hi and k*ln2lo are subtracted separately during argument reduction. */
static const float ln2hi  = 6.9314575195e-1f;  /* 0x3f317200 */
static const float ln2lo  = 1.4286067653e-6f;  /* 0x35bfbe8e */

/* 1/ln2, used to estimate k from x. */
static const float invln2 = 1.4426950216e+0f;  /* 0x3fb8aa3b */

/*
 * Polynomial coefficients.
 * Domain [-0.34568, 0.34568], range ~[-4.278e-9, 4.447e-9]:
 * |x*(exp(x)+1)/(exp(x)-1) - p(x)| < 2**-27.74
 */
static const float P1 =  1.6666625440e-1f;  /*  0xaaaa8f.0p-26 */
static const float P2 = -2.7667332906e-3f;  /* -0xb55215.0p-32 */
/*
 * expf(x): single-precision e**x.
 *
 * Method, as implemented below:
 *   1. Special cases: NaN is propagated, large positive x overflows,
 *      large negative x underflows (to 0 once x <= -103.972084f).
 *   2. Argument reduction: for |x| > 0.5*ln2 pick an integer k and split
 *      x = k*ln2 + r, with r carried as the pair (hi, lo) and x := hi - lo.
 *      For 2**-14 < |x| <= 0.5*ln2 no reduction is needed (k = 0).
 *      For |x| <= 2**-14 the result is simply 1 + x.
 *   3. Evaluate exp(r) from the polynomial in P1/P2 and scale the result
 *      by 2**k with scalbnf().
 */
float expf(float x)
{
	float_t hi, lo, c, xx, y;
	int k, sign;
	uint32_t hx;

	GET_FLOAT_WORD(hx, x);
	sign = hx >> 31;   /* sign bit of x */
	hx &= 0x7fffffff;  /* bit pattern of |x| */

	/* special cases */
	if (hx >= 0x42aeac50) {  /* |x| >= 87.33655f, inf or NaN */
		/*
		 * NaN must be handled first: without this check a NaN with the
		 * sign bit set falls into the underflow branch below and is
		 * returned as 0 instead of NaN.
		 */
		if (hx > 0x7f800000)
			return x;
		if (hx >= 0x42b17218 && !sign) {  /* x >= 88.722839f */
			/* overflow */
			x *= 0x1p127f;
			return x;
		}
		if (sign) {
			/* underflow */
			FORCE_EVAL(-0x1p-149f/x);
			if (hx >= 0x42cff1b5)  /* x <= -103.972084f */
				return 0;
		}
	}

	/* argument reduction */
	if (hx > 0x3eb17218) {  /* if |x| > 0.5 ln2 */
		if (hx > 0x3f851592)  /* if |x| > 1.5 ln2 */
			k = invln2*x + half[sign];
		else
			k = 1 - sign - sign;  /* +1 for positive x, -1 for negative x */
		hi = x - k*ln2hi;  /* k*ln2hi is exact here */
		lo = k*ln2lo;
		x = hi - lo;
	} else if (hx > 0x39000000) {  /* |x| > 2**-14 */
		k = 0;
		hi = x;
		lo = 0;
	} else {
		/* raise inexact */
		FORCE_EVAL(0x1p127f + x);
		return 1 + x;
	}

	/* x is now in primary range */
	xx = x*x;
	c = x - xx*(P1+xx*P2);
	y = 1 + (x*c/(2-c) - lo + hi);
	if (k == 0)
		return y;
	return scalbnf(y, k);
}

View File

@ -0,0 +1,31 @@
#include <math.h>
#include <stdint.h>
/*
 * scalbnf(x, n): returns x * 2**n.
 *
 * The final step multiplies by a power of two that is built directly from
 * its exponent bits, so n must first be brought into [-126, 127]. Exponents
 * outside that range are applied in up to two preliminary multiplications,
 * after which n is clamped.
 *
 * For n < -126 the preliminary factor is 2**-102 (0x1p-126f * 0x1p24f)
 * rather than 2**-126. Scaling by 2**-126 can push the intermediate value
 * into the subnormal range, where it is rounded; the final multiply then
 * rounds a second time and the result can be off by one ulp. Staying 24
 * bits (the float precision) above the subnormal range keeps the
 * intermediate exact whenever the final result is nonzero, so only the
 * last multiplication rounds.
 */
float scalbnf(float x, int n)
{
	union {float f; uint32_t i;} u;
	float_t y = x;

	if (n > 127) {
		y *= 0x1p127f;
		n -= 127;
		if (n > 127) {
			y *= 0x1p127f;
			n -= 127;
			if (n > 127)
				n = 127;
		}
	} else if (n < -126) {
		y *= 0x1p-126f * 0x1p24f;
		n += 126 - 24;
		if (n < -126) {
			y *= 0x1p-126f * 0x1p24f;
			n += 126 - 24;
			if (n < -126)
				n = -126;
		}
	}
	/* build 2**n from its biased exponent field */
	u.i = (uint32_t)(0x7f+n)<<23;
	x = y * u.f;
	return x;
}

2
contrib/arrow vendored

@ -1 +1 @@
Subproject commit d03245f801f798c63ee9a7d2b8914a9e5c5cd666
Subproject commit 1f1b3d35fb6eb73e6492d3afd8a85cde848d174f

View File

@ -202,6 +202,7 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/builder.cc"
"${LIBRARY_DIR}/buffer.cc"
"${LIBRARY_DIR}/chunked_array.cc"
"${LIBRARY_DIR}/chunk_resolver.cc"
"${LIBRARY_DIR}/compare.cc"
"${LIBRARY_DIR}/config.cc"
"${LIBRARY_DIR}/datum.cc"
@ -268,6 +269,10 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/util/uri.cc"
"${LIBRARY_DIR}/util/utf8.cc"
"${LIBRARY_DIR}/util/value_parsing.cc"
"${LIBRARY_DIR}/util/byte_size.cc"
"${LIBRARY_DIR}/util/debug.cc"
"${LIBRARY_DIR}/util/tracing.cc"
"${LIBRARY_DIR}/util/atfork_internal.cc"
"${LIBRARY_DIR}/vendored/base64.cpp"
"${LIBRARY_DIR}/vendored/datetime/tz.cpp"
@ -301,9 +306,11 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/compute/exec/source_node.cc"
"${LIBRARY_DIR}/compute/exec/sink_node.cc"
"${LIBRARY_DIR}/compute/exec/order_by_impl.cc"
"${LIBRARY_DIR}/compute/exec/partition_util.cc"
"${LIBRARY_DIR}/compute/function.cc"
"${LIBRARY_DIR}/compute/function_internal.cc"
"${LIBRARY_DIR}/compute/kernel.cc"
"${LIBRARY_DIR}/compute/light_array.cc"
"${LIBRARY_DIR}/compute/registry.cc"
"${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc"
"${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc"
@ -317,21 +324,28 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_dictionary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_extension.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_compare.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_nested.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_random.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_round.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_temporal_binary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_temporal_unary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_validity.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_if_else.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string_ascii.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string_utf8.cc"
"${LIBRARY_DIR}/compute/kernels/util_internal.cc"
"${LIBRARY_DIR}/compute/kernels/vector_array_sort.cc"
"${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc"
"${LIBRARY_DIR}/compute/kernels/vector_hash.cc"
"${LIBRARY_DIR}/compute/kernels/vector_rank.cc"
"${LIBRARY_DIR}/compute/kernels/vector_select_k.cc"
"${LIBRARY_DIR}/compute/kernels/vector_nested.cc"
"${LIBRARY_DIR}/compute/kernels/vector_replace.cc"
"${LIBRARY_DIR}/compute/kernels/vector_selection.cc"
@ -340,13 +354,15 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/compute/exec/union_node.cc"
"${LIBRARY_DIR}/compute/exec/key_hash.cc"
"${LIBRARY_DIR}/compute/exec/key_map.cc"
"${LIBRARY_DIR}/compute/exec/key_compare.cc"
"${LIBRARY_DIR}/compute/exec/key_encode.cc"
"${LIBRARY_DIR}/compute/exec/util.cc"
"${LIBRARY_DIR}/compute/exec/hash_join_dict.cc"
"${LIBRARY_DIR}/compute/exec/hash_join.cc"
"${LIBRARY_DIR}/compute/exec/hash_join_node.cc"
"${LIBRARY_DIR}/compute/exec/task_util.cc"
"${LIBRARY_DIR}/compute/row/encode_internal.cc"
"${LIBRARY_DIR}/compute/row/grouper.cc"
"${LIBRARY_DIR}/compute/row/compare_internal.cc"
"${LIBRARY_DIR}/compute/row/row_internal.cc"
"${LIBRARY_DIR}/ipc/dictionary.cc"
"${LIBRARY_DIR}/ipc/feather.cc"
@ -357,7 +373,8 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/ipc/writer.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/adapter_util.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/options.cc"
)
add_definitions(-DARROW_WITH_LZ4)

View File

@ -21,9 +21,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// For the ORC format (index_nested_type = true), a nested type takes one index count, and
/// the start index should be 1, since index 0 indicates to select all columns.
template<bool index_nested_type>
class ArrowFieldIndexUtil
{
public:
@ -46,9 +44,7 @@ public:
calculateFieldIndices(const arrow::Schema & schema)
{
std::unordered_map<std::string, std::pair<int, int>> result;
// For format like ORC, index = 0 indicates to select all columns, so we skip 0 and start
// from 1.
int index_start = index_nested_type;
int index_start = 0;
for (int i = 0; i < schema.num_fields(); ++i)
{
const auto & field = schema.field(i);
@ -94,17 +90,16 @@ public:
}
/// Count the number of indices for types.
/// For orc format, index_nested_type is true, a complex type takes one index.
size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
{
if (type->id() == arrow::Type::LIST)
{
return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + index_nested_type;
return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type());
}
if (type->id() == arrow::Type::STRUCT)
{
int indices = index_nested_type;
int indices = 0;
auto * struct_type = static_cast<arrow::StructType *>(type.get());
for (int i = 0; i != struct_type->num_fields(); ++i)
indices += countIndicesForType(struct_type->field(i)->type());
@ -114,7 +109,7 @@ public:
if (type->id() == arrow::Type::MAP)
{
auto * map_type = static_cast<arrow::MapType *>(type.get());
return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()) + index_nested_type;
return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()) ;
}
return 1;
@ -144,8 +139,6 @@ private:
index_info.first = current_start_index;
if (field_type->id() == arrow::Type::STRUCT)
{
current_start_index += index_nested_type;
auto * struct_type = static_cast<arrow::StructType *>(field_type.get());
for (int i = 0, n = struct_type->num_fields(); i < n; ++i)
{
@ -161,7 +154,6 @@ private:
const auto * list_type = static_cast<arrow::ListType *>(field_type.get());
const auto value_field = list_type->value_field();
auto index_snapshot = current_start_index;
current_start_index += index_nested_type;
calculateFieldIndices(*value_field, field_name, current_start_index, result, name_prefix);
// The nested struct field has the same name as this list field.
// rewrite it back to the original value.

View File

@ -129,10 +129,17 @@ void ORCBlockInputFormat::prepareReader()
format_settings.null_as_default,
format_settings.orc.case_insensitive_column_matching);
ArrowFieldIndexUtil<true> field_util(
format_settings.orc.case_insensitive_column_matching,
format_settings.orc.allow_missing_columns);
include_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);
const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
std::unordered_set<String> nested_table_names;
if (format_settings.orc.import_nested)
nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case);
for (int i = 0; i < schema->num_fields(); ++i)
{
const auto & name = schema->field(i)->name();
if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name))
include_indices.push_back(i);
}
}
ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)

View File

@ -132,7 +132,7 @@ void ParquetBlockInputFormat::prepareReader()
format_settings.null_as_default,
format_settings.parquet.case_insensitive_column_matching);
ArrowFieldIndexUtil<false> field_util(
ArrowFieldIndexUtil field_util(
format_settings.parquet.case_insensitive_column_matching,
format_settings.parquet.allow_missing_columns);
column_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);

View File

@ -95,14 +95,14 @@ void ParquetBlockOutputFormat::consume(Chunk chunk)
builder.version(getParquetVersion(format_settings));
builder.compression(getParquetCompression(format_settings.parquet.output_compression_method));
auto props = builder.build();
auto status = parquet::arrow::FileWriter::Open(
auto result = parquet::arrow::FileWriter::Open(
*arrow_table->schema(),
arrow::default_memory_pool(),
sink,
props, /*parquet::default_writer_properties(),*/
&file_writer);
if (!status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", status.ToString());
props);
if (!result.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", result.status().ToString());
file_writer = std::move(result.ValueOrDie());
}
// TODO: calculate row_group_size depending on a number of rows and table size

View File

@ -92,8 +92,11 @@ idx10 ['This','is','a','test']
123 1
456 2
=== Try load data from datapage_v2.snappy.parquet
Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). (CANNOT_READ_ALL_DATA)
abc 1 2 1 [1,2,3]
abc 2 3 1 []
abc 3 4 1 []
\N 4 5 0 [1,2,3]
abc 5 2 1 [1,2]
=== Try load data from datatype-date32.parquet
1925-01-01
1949-10-01

View File

@ -1 +1 @@
`a` Nullable(String), `b` Array(Nullable(Int32)), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32))
`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32))