From 43735f3b85a60fc9438f92c3e32223f2498e6dc2 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Wed, 4 Nov 2020 13:59:23 +0300 Subject: [PATCH 01/21] Switch upstream repo for Arrow submodule --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index fdd48fcce01..8dc58eacf74 100644 --- a/.gitmodules +++ b/.gitmodules @@ -52,7 +52,7 @@ url = https://github.com/ClickHouse-Extras/Turbo-Base64.git [submodule "contrib/arrow"] path = contrib/arrow - url = https://github.com/apache/arrow + url = https://github.com/ClickHouse-Extras/arrow [submodule "contrib/thrift"] path = contrib/thrift url = https://github.com/apache/thrift.git From 19f81e0f8259c300fc9aa7e5b6e9e61105089045 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Wed, 4 Nov 2020 15:51:10 +0300 Subject: [PATCH 02/21] Update submodule sha --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index 3cbcb7b62c2..c7fe4e6b80a 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 3cbcb7b62c2f2d02851bff837758637eb592a64b +Subproject commit c7fe4e6b80a9d090c3ab25334b54b8cd42859321 From b0277e99a5886ae9bd0adb9a4555e182ca3cf753 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Wed, 4 Nov 2020 19:51:43 +0300 Subject: [PATCH 03/21] Update sources list --- contrib/arrow-cmake/CMakeLists.txt | 95 +++++++++++++------ .../arrow-cmake/cpp/src/arrow/util/config.h | 11 +++ 2 files changed, 77 insertions(+), 29 deletions(-) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 442f2534f6a..70c966bf573 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -148,11 +148,12 @@ configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_SOURCE_DIR} # arrow/cpp/src/arrow/CMakeLists.txt set(ARROW_SRCS - ${LIBRARY_DIR}/array.cc ${LIBRARY_DIR}/buffer.cc - ${LIBRARY_DIR}/device.cc ${LIBRARY_DIR}/builder.cc + ${LIBRARY_DIR}/chunked_array.cc ${LIBRARY_DIR}/compare.cc + ${LIBRARY_DIR}/datum.cc + ${LIBRARY_DIR}/device.cc ${LIBRARY_DIR}/extension_type.cc ${LIBRARY_DIR}/memory_pool.cc ${LIBRARY_DIR}/pretty_print.cc @@ -167,11 +168,12 @@ set(ARROW_SRCS ${LIBRARY_DIR}/type.cc ${LIBRARY_DIR}/visitor.cc - ${LIBRARY_DIR}/tensor/coo_converter.cc - ${LIBRARY_DIR}/tensor/csc_converter.cc - ${LIBRARY_DIR}/tensor/csf_converter.cc - ${LIBRARY_DIR}/tensor/csr_converter.cc - + ${LIBRARY_DIR}/array/array_base.cc + ${LIBRARY_DIR}/array/array_binary.cc + ${LIBRARY_DIR}/array/array_decimal.cc + ${LIBRARY_DIR}/array/array_dict.cc + ${LIBRARY_DIR}/array/array_nested.cc + ${LIBRARY_DIR}/array/array_primitive.cc ${LIBRARY_DIR}/array/builder_adaptive.cc ${LIBRARY_DIR}/array/builder_base.cc ${LIBRARY_DIR}/array/builder_binary.cc @@ -181,17 +183,48 @@ set(ARROW_SRCS ${LIBRARY_DIR}/array/builder_primitive.cc ${LIBRARY_DIR}/array/builder_union.cc ${LIBRARY_DIR}/array/concatenate.cc - ${LIBRARY_DIR}/array/dict_internal.cc + ${LIBRARY_DIR}/array/data.cc ${LIBRARY_DIR}/array/diff.cc + ${LIBRARY_DIR}/array/util.cc ${LIBRARY_DIR}/array/validate.cc - ${LIBRARY_DIR}/csv/converter.cc + ${LIBRARY_DIR}/compute/api_scalar.cc + ${LIBRARY_DIR}/compute/api_vector.cc + ${LIBRARY_DIR}/compute/cast.cc + ${LIBRARY_DIR}/compute/exec.cc + ${LIBRARY_DIR}/compute/function.cc + ${LIBRARY_DIR}/compute/kernel.cc + ${LIBRARY_DIR}/compute/registry.cc + + ${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc + ${LIBRARY_DIR}/compute/kernels/codegen_internal.cc + ${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc + 
${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc + ${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc + ${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc + ${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc + ${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc + ${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc + ${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc + ${LIBRARY_DIR}/compute/kernels/scalar_compare.cc + ${LIBRARY_DIR}/compute/kernels/scalar_fill_null.cc + ${LIBRARY_DIR}/compute/kernels/scalar_nested.cc + ${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc + ${LIBRARY_DIR}/compute/kernels/scalar_string.cc + ${LIBRARY_DIR}/compute/kernels/scalar_validity.cc + ${LIBRARY_DIR}/compute/kernels/vector_hash.cc + ${LIBRARY_DIR}/compute/kernels/vector_nested.cc + ${LIBRARY_DIR}/compute/kernels/vector_selection.cc + ${LIBRARY_DIR}/compute/kernels/vector_sort.cc + ${LIBRARY_DIR}/compute/kernels/util_internal.cc + ${LIBRARY_DIR}/csv/chunker.cc ${LIBRARY_DIR}/csv/column_builder.cc + ${LIBRARY_DIR}/csv/column_decoder.cc + ${LIBRARY_DIR}/csv/converter.cc ${LIBRARY_DIR}/csv/options.cc ${LIBRARY_DIR}/csv/parser.cc ${LIBRARY_DIR}/csv/reader.cc - ${LIBRARY_DIR}/csv/column_decoder.cc ${LIBRARY_DIR}/ipc/dictionary.cc ${LIBRARY_DIR}/ipc/feather.cc @@ -202,14 +235,24 @@ set(ARROW_SRCS ${LIBRARY_DIR}/ipc/writer.cc ${LIBRARY_DIR}/io/buffered.cc + ${LIBRARY_DIR}/io/caching.cc ${LIBRARY_DIR}/io/compressed.cc ${LIBRARY_DIR}/io/file.cc ${LIBRARY_DIR}/io/interfaces.cc ${LIBRARY_DIR}/io/memory.cc ${LIBRARY_DIR}/io/slow.cc + ${LIBRARY_DIR}/tensor/coo_converter.cc + ${LIBRARY_DIR}/tensor/csf_converter.cc + ${LIBRARY_DIR}/tensor/csx_converter.cc + ${LIBRARY_DIR}/util/basic_decimal.cc + ${LIBRARY_DIR}/util/bit_block_counter.cc + ${LIBRARY_DIR}/util/bit_run_reader.cc ${LIBRARY_DIR}/util/bit_util.cc + ${LIBRARY_DIR}/util/bitmap.cc + ${LIBRARY_DIR}/util/bitmap_builders.cc + ${LIBRARY_DIR}/util/bitmap_ops.cc ${LIBRARY_DIR}/util/compression.cc ${LIBRARY_DIR}/util/compression_lz4.cc ${LIBRARY_DIR}/util/compression_snappy.cc @@ -217,8 +260,12 @@ set(ARROW_SRCS ${LIBRARY_DIR}/util/compression_zstd.cc ${LIBRARY_DIR}/util/cpu_info.cc ${LIBRARY_DIR}/util/decimal.cc + ${LIBRARY_DIR}/util/delimiting.cc + ${LIBRARY_DIR}/util/formatting.cc + ${LIBRARY_DIR}/util/future.cc ${LIBRARY_DIR}/util/int_util.cc ${LIBRARY_DIR}/util/io_util.cc + ${LIBRARY_DIR}/util/iterator.cc ${LIBRARY_DIR}/util/key_value_metadata.cc ${LIBRARY_DIR}/util/logging.cc ${LIBRARY_DIR}/util/memory.cc @@ -226,27 +273,15 @@ set(ARROW_SRCS ${LIBRARY_DIR}/util/string.cc ${LIBRARY_DIR}/util/task_group.cc ${LIBRARY_DIR}/util/thread_pool.cc + ${LIBRARY_DIR}/util/time.cc ${LIBRARY_DIR}/util/trie.cc ${LIBRARY_DIR}/util/utf8.cc - ${LIBRARY_DIR}/util/future.cc - ${LIBRARY_DIR}/util/formatting.cc - ${LIBRARY_DIR}/util/parsing.cc - ${LIBRARY_DIR}/util/time.cc - ${LIBRARY_DIR}/util/delimiting.cc - ${LIBRARY_DIR}/util/iterator.cc + ${LIBRARY_DIR}/util/value_parsing.cc ${LIBRARY_DIR}/vendored/base64.cpp ${ORC_SRCS} ) -set(ARROW_SRCS ${ARROW_SRCS} - ${LIBRARY_DIR}/compute/context.cc - ${LIBRARY_DIR}/compute/kernels/boolean.cc - ${LIBRARY_DIR}/compute/kernels/cast.cc - ${LIBRARY_DIR}/compute/kernels/hash.cc - ${LIBRARY_DIR}/compute/kernels/util_internal.cc - ) - if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY) set(ARROW_WITH_SNAPPY 1) endif () @@ -319,19 +354,25 @@ set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/parquet) set(GEN_LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/generated) # arrow/cpp/src/parquet/CMakeLists.txt 
set(PARQUET_SRCS + ${LIBRARY_DIR}/arrow/path_internal.cc ${LIBRARY_DIR}/arrow/reader.cc ${LIBRARY_DIR}/arrow/reader_internal.cc ${LIBRARY_DIR}/arrow/schema.cc + ${LIBRARY_DIR}/arrow/schema_internal.cc ${LIBRARY_DIR}/arrow/writer.cc - ${LIBRARY_DIR}/arrow/path_internal.cc ${LIBRARY_DIR}/bloom_filter.cc ${LIBRARY_DIR}/column_reader.cc ${LIBRARY_DIR}/column_scanner.cc ${LIBRARY_DIR}/column_writer.cc ${LIBRARY_DIR}/deprecated_io.cc ${LIBRARY_DIR}/encoding.cc + ${LIBRARY_DIR}/encryption.cc + ${LIBRARY_DIR}/encryption_internal.cc ${LIBRARY_DIR}/file_reader.cc ${LIBRARY_DIR}/file_writer.cc + ${LIBRARY_DIR}/internal_file_decryptor.cc + ${LIBRARY_DIR}/internal_file_encryptor.cc + ${LIBRARY_DIR}/level_conversion.cc ${LIBRARY_DIR}/metadata.cc ${LIBRARY_DIR}/murmur3.cc ${LIBRARY_DIR}/platform.cc @@ -340,10 +381,6 @@ set(PARQUET_SRCS ${LIBRARY_DIR}/schema.cc ${LIBRARY_DIR}/statistics.cc ${LIBRARY_DIR}/types.cc - ${LIBRARY_DIR}/encryption.cc - ${LIBRARY_DIR}/encryption_internal.cc - ${LIBRARY_DIR}/internal_file_decryptor.cc - ${LIBRARY_DIR}/internal_file_encryptor.cc ${GEN_LIBRARY_DIR}/parquet_constants.cpp ${GEN_LIBRARY_DIR}/parquet_types.cpp diff --git a/contrib/arrow-cmake/cpp/src/arrow/util/config.h b/contrib/arrow-cmake/cpp/src/arrow/util/config.h index bf8ea581922..71613baf5a8 100644 --- a/contrib/arrow-cmake/cpp/src/arrow/util/config.h +++ b/contrib/arrow-cmake/cpp/src/arrow/util/config.h @@ -20,7 +20,18 @@ #define ARROW_VERSION_PATCH #define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH +#define ARROW_VERSION_STRING "" + #define ARROW_SO_VERSION "" #define ARROW_FULL_SO_VERSION "" +#define ARROW_CXX_COMPILER_ID "" +#define ARROW_CXX_COMPILER_VERSION "" +#define ARROW_CXX_COMPILER_FLAGS "" + +#define ARROW_GIT_ID "" +#define ARROW_GIT_DESCRIPTION "" + +#define ARROW_PACKAGE_KIND "" + /* #undef GRPCPP_PP_INCLUDE */ From 4f82e95a460fa09458f072cff3b596187d35e301 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Thu, 5 Nov 2020 19:49:11 +0300 Subject: [PATCH 04/21] Update submodule to the latest release --- .gitmodules | 1 + contrib/arrow | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 8dc58eacf74..591139856bd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -53,6 +53,7 @@ [submodule "contrib/arrow"] path = contrib/arrow url = https://github.com/ClickHouse-Extras/arrow + branch = clickhouse-fix-ipv6 [submodule "contrib/thrift"] path = contrib/thrift url = https://github.com/apache/thrift.git diff --git a/contrib/arrow b/contrib/arrow index c7fe4e6b80a..b693865b56f 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit c7fe4e6b80a9d090c3ab25334b54b8cd42859321 +Subproject commit b693865b56fba31746278bb9c03bf0c9149fe19a From 069e3dec0dd65d17590429532b605303920010cc Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Thu, 5 Nov 2020 20:10:11 +0300 Subject: [PATCH 05/21] Do not use deprecated API --- contrib/arrow-cmake/cpp/src/arrow/util/config.h | 2 ++ src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/contrib/arrow-cmake/cpp/src/arrow/util/config.h b/contrib/arrow-cmake/cpp/src/arrow/util/config.h index 71613baf5a8..26206c3d562 100644 --- a/contrib/arrow-cmake/cpp/src/arrow/util/config.h +++ b/contrib/arrow-cmake/cpp/src/arrow/util/config.h @@ -34,4 +34,6 @@ #define ARROW_PACKAGE_KIND "" +/* #undef ARROW_S3 */ + /* #undef GRPCPP_PP_INCLUDE */ diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp 
b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp index 756172a5a68..c1abdd1a759 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp @@ -62,9 +62,9 @@ void ArrowBlockOutputFormat::prepareWriter(const std::shared_ptr // TODO: should we use arrow::ipc::IpcOptions::alignment? if (stream) - writer_status = arrow::ipc::NewStreamWriter(arrow_ostream.get(), schema); + writer_status = arrow::ipc::MakeStreamWriter(arrow_ostream.get(), schema); else - writer_status = arrow::ipc::NewFileWriter(arrow_ostream.get(), schema); + writer_status = arrow::ipc::MakeFileWriter(arrow_ostream.get(), schema); if (!writer_status.ok()) throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, From 4598046176b5d3bf4218374b690adf7367a9f3a2 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 16:24:50 +0300 Subject: [PATCH 06/21] Update cmake --- contrib/arrow-cmake/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 70c966bf573..2394798ba8a 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -197,6 +197,8 @@ set(ARROW_SRCS ${LIBRARY_DIR}/compute/registry.cc ${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc + ${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc + ${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc ${LIBRARY_DIR}/compute/kernels/codegen_internal.cc ${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc ${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc @@ -253,6 +255,7 @@ set(ARROW_SRCS ${LIBRARY_DIR}/util/bitmap.cc ${LIBRARY_DIR}/util/bitmap_builders.cc ${LIBRARY_DIR}/util/bitmap_ops.cc + ${LIBRARY_DIR}/util/bpacking.cc ${LIBRARY_DIR}/util/compression.cc ${LIBRARY_DIR}/util/compression_lz4.cc ${LIBRARY_DIR}/util/compression_snappy.cc @@ -373,6 +376,7 @@ set(PARQUET_SRCS ${LIBRARY_DIR}/internal_file_decryptor.cc ${LIBRARY_DIR}/internal_file_encryptor.cc ${LIBRARY_DIR}/level_conversion.cc + ${LIBRARY_DIR}/level_comparison.cc ${LIBRARY_DIR}/metadata.cc ${LIBRARY_DIR}/murmur3.cc ${LIBRARY_DIR}/platform.cc From 066a3032e1476dad7ecd87982d31f398ada50a9b Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 16:25:30 +0300 Subject: [PATCH 07/21] Move arrow config from SOURCE_DIR to BINARY_DIR --- contrib/arrow-cmake/CMakeLists.txt | 5 ++- .../arrow-cmake/cpp/src/arrow/util/config.h | 39 ------------------- 2 files changed, 3 insertions(+), 41 deletions(-) delete mode 100644 contrib/arrow-cmake/cpp/src/arrow/util/config.h diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 2394798ba8a..0a18cebb6c4 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -144,7 +144,7 @@ set(ORC_SRCS set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow) -configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/arrow/util/config.h") +configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/cpp/src/arrow/util/config.h") # arrow/cpp/src/arrow/CMakeLists.txt set(ARROW_SRCS @@ -327,7 +327,8 @@ if (USE_INTERNAL_PROTOBUF_LIBRARY) add_dependencies(${ARROW_LIBRARY} protoc) endif () -target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src) +target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src) +target_include_directories(${ARROW_LIBRARY} PRIVATE 
${CMAKE_CURRENT_BINARY_DIR}/cpp/src) target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${Protobuf_LIBRARY}) target_link_libraries(${ARROW_LIBRARY} PRIVATE lz4) if (ARROW_WITH_SNAPPY) diff --git a/contrib/arrow-cmake/cpp/src/arrow/util/config.h b/contrib/arrow-cmake/cpp/src/arrow/util/config.h deleted file mode 100644 index 26206c3d562..00000000000 --- a/contrib/arrow-cmake/cpp/src/arrow/util/config.h +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#define ARROW_VERSION_MAJOR -#define ARROW_VERSION_MINOR -#define ARROW_VERSION_PATCH -#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH - -#define ARROW_VERSION_STRING "" - -#define ARROW_SO_VERSION "" -#define ARROW_FULL_SO_VERSION "" - -#define ARROW_CXX_COMPILER_ID "" -#define ARROW_CXX_COMPILER_VERSION "" -#define ARROW_CXX_COMPILER_FLAGS "" - -#define ARROW_GIT_ID "" -#define ARROW_GIT_DESCRIPTION "" - -#define ARROW_PACKAGE_KIND "" - -/* #undef ARROW_S3 */ - -/* #undef GRPCPP_PP_INCLUDE */ From 8fd417f9a34a16372872577c66dcd479597e071f Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 16:25:38 +0300 Subject: [PATCH 08/21] Fix build --- contrib/arrow-cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 0a18cebb6c4..4b402a9db79 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -328,7 +328,7 @@ if (USE_INTERNAL_PROTOBUF_LIBRARY) endif () target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src) -target_include_directories(${ARROW_LIBRARY} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/cpp/src) +target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/cpp/src) target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${Protobuf_LIBRARY}) target_link_libraries(${ARROW_LIBRARY} PRIVATE lz4) if (ARROW_WITH_SNAPPY) From 9be0340d98e956ee8cc3a1c3c53febf508533091 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 16:25:48 +0300 Subject: [PATCH 09/21] Regenerate parquet config --- contrib/arrow-cmake/cpp/src/parquet/parquet_version.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/arrow-cmake/cpp/src/parquet/parquet_version.h b/contrib/arrow-cmake/cpp/src/parquet/parquet_version.h index 7404db1d381..ec9fb32b2a5 100644 --- a/contrib/arrow-cmake/cpp/src/parquet/parquet_version.h +++ b/contrib/arrow-cmake/cpp/src/parquet/parquet_version.h @@ -22,8 +22,8 @@ #define PARQUET_VERSION_MINOR 5 #define PARQUET_VERSION_PATCH 1 -#define PARQUET_SO_VERSION 0 -#define PARQUET_FULL_SO_VERSION 0.17 +#define 
PARQUET_SO_VERSION "200" +#define PARQUET_FULL_SO_VERSION "200.0.0" // define the parquet created by version #define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT" From ccbdecfb453e0f6c480f812b4e8df235859654d5 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 16:25:53 +0300 Subject: [PATCH 10/21] Rewrite and fix auxiliary script for parquet_load test --- .../00900_parquet_create_table_columns.pl | 54 ----------- .../00900_parquet_create_table_columns.py | 89 +++++++++++++++++++ 2 files changed, 89 insertions(+), 54 deletions(-) delete mode 100755 tests/queries/0_stateless/00900_parquet_create_table_columns.pl create mode 100755 tests/queries/0_stateless/00900_parquet_create_table_columns.py diff --git a/tests/queries/0_stateless/00900_parquet_create_table_columns.pl b/tests/queries/0_stateless/00900_parquet_create_table_columns.pl deleted file mode 100755 index baba1f63aee..00000000000 --- a/tests/queries/0_stateless/00900_parquet_create_table_columns.pl +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env perl -package parquet_create_table_columns; -use strict; -no warnings 'experimental'; -use feature 'signatures'; -use JSON::XS; -#use Data::Dumper; - -sub file_read($file) { - open my $f, '<', $file or return; - local $/ = undef; - my $ret = <$f>; - close $f; - return $ret; -} - -our $type_parquet_logical_to_clickhouse = { - DECIMAL => 'Decimal128(1)', - TIMESTAMP_MICROS => 'DateTime', - TIMESTAMP_MILLIS => 'DateTime', -}; -our $type_parquet_physical_to_clickhouse = { - BOOLEAN => 'UInt8', - INT32 => 'Int32', - INT64 => 'Int64', - FLOAT => 'Float32', - DOUBLE => 'Float64', - BYTE_ARRAY => 'String', - FIXED_LEN_BYTE_ARRAY => 'String', # Maybe FixedString? - INT96 => 'Int64', # TODO! -}; - -sub columns ($json) { - my @list; - my %uniq; - for my $column (@{$json->{Columns}}) { - #warn Data::Dumper::Dumper $column; - my $name = $column->{'Name'}; - my $type = $type_parquet_logical_to_clickhouse->{$column->{'LogicalType'}} || $type_parquet_physical_to_clickhouse->{$column->{'PhysicalType'}}; - unless ($type) { - warn "Unknown type [$column->{'PhysicalType'}:$column->{'LogicalType'}] of column [$name]"; - } - $type = "Nullable($type)"; - $name .= $column->{'Id'} if $uniq{$name}++; # Names can be non-unique - push @list, {name => $name, type => $type}; - } - print join ', ', map {"`$_->{name}` $_->{type}"} @list; -} - -sub columns_file ($file) { - return columns(JSON::XS::decode_json(file_read($file))); -} - -columns_file(shift) unless caller; diff --git a/tests/queries/0_stateless/00900_parquet_create_table_columns.py b/tests/queries/0_stateless/00900_parquet_create_table_columns.py new file mode 100755 index 00000000000..44dc6d6df64 --- /dev/null +++ b/tests/queries/0_stateless/00900_parquet_create_table_columns.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +import json +import sys + +TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE = { + "DECIMAL": "Decimal128(1)", + "TIMESTAMP_MICROS": "DateTime", + "TIMESTAMP_MILLIS": "DateTime", + "UTF8": "String", +} + +TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE = { + "BOOLEAN": "UInt8", + "INT32": "Int32", + "INT64": "Int64", + "FLOAT": "Float32", + "DOUBLE": "Float64", + "BYTE_ARRAY": "String", + "INT96": "Int64", # TODO! 
+} + +def read_file(filename): + with open(filename, "rb") as f: + return f.read().decode("raw_unicode_escape") + +def get_column_name(column): + return column["Name"].split(".", 1)[0] + +def resolve_clickhouse_column_type(column): + column_name = get_column_name(column) + logical_type = column.get("LogicalType", {}) + converted_type = column.get("ConvertedType", "").upper() + physical_type = column.get("PhysicalType", "").upper() + if logical_type and logical_type.get("Type", "").upper() == "DECIMAL": + precision = int(logical_type["precision"]) + scale = int(logical_type["scale"]) + if precision < 1 or precision > 76: + raise RuntimeError("Column {} has invalid Decimal precision {}".format(column_name, precision)) + if precision > 38: + raise RuntimeError("Column {} has unsupported Decimal precision {}".format(column_name, precision)) + if scale < 0 or scale > precision: + raise RuntimeError("Column {} has invalid Decimal scale {} for precision {}".format(column_name, scale, precision)) + return "Decimal({}, {})".format(precision, scale) + if converted_type and converted_type != "NONE": + result_type = TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE.get(converted_type) + if result_type: + return result_type + raise RuntimeError("Column {} has unknown ConvertedType: {}".format(column_name, converted_type)) + if physical_type and physical_type != "NONE": + result_type = TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE.get(physical_type) + if result_type: + return result_type + raise RuntimeError("Column {} has unknown PhysicalType: {}".format(column_name, physical_type)) + raise RuntimeError("Column {} has invalid types: ConvertedType={}, PhysicalType={}".format(column_name, converted_type, physical_type)) + +def dump_columns(obj): + descr_by_column_name = {} + columns_descr = [] + for column in obj["Columns"]: + column_name = get_column_name(column) + column_type = resolve_clickhouse_column_type(column) + result_type = "Nullable({})".format(column_type) + if column_name in descr_by_column_name: + descr = descr_by_column_name[column_name] + descr["types"].append(result_type) + else: + descr = { + "name": column_name, + "types": [result_type], + } + descr_by_column_name[column_name] = descr + columns_descr.append(descr) + + # Make tuples from nested types. CH Server doesn't support such Arrow type but it makes Server Exceptions more relevant. + def _format_type(types): + if len(types) == 1: + return types[0] + else: + return "Tuple({})".format(", ".join(types)) + + print(", ".join(map(lambda descr: "`{}` {}".format(descr["name"], _format_type(descr["types"])), columns_descr))) + +def dump_columns_from_file(filename): + dump_columns(json.loads(read_file(filename), strict=False)) + +if __name__ == "__main__": + filename = sys.argv[1] + dump_columns_from_file(filename) From 5978c130acaa977ac03dc05dc5ff4b3467a88e67 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 16:25:57 +0300 Subject: [PATCH 11/21] Update main test script --- .../queries/0_stateless/00900_parquet_load.sh | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/00900_parquet_load.sh b/tests/queries/0_stateless/00900_parquet_load.sh index 346fa4f915c..59e7a1588b2 100755 --- a/tests/queries/0_stateless/00900_parquet_load.sh +++ b/tests/queries/0_stateless/00900_parquet_load.sh @@ -19,17 +19,20 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CUR_DIR"/../shell_config.sh CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY") -[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../../.. 
-[ "$CB_DIR" != "." ] && BUILD_DIR=$CB_DIR/../.. -[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../../.. +[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../.. +[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../.. DATA_DIR=$CUR_DIR/data_parquet +[ -n "$ROOT_DIR" ] && [ -z "$PARQUET_READER" ] && PARQUET_READER="$ROOT_DIR"/contrib/arrow/cpp/build/release/parquet-reader + # To update: # cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/ -# BUG! nulls.snappy.parquet - parquet-reader shows wrong structure. Actual structure is {"type":"struct","fields":[{"name":"b_struct","type":{"type":"struct","fields":[{"name":"b_c_int","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]} -# why? repeated_no_annotation.parquet +# There is failure due parsing nested arrays or nested maps with NULLs: +# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) +# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) +# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | sort); do echo === Try load data from "$NAME" @@ -38,19 +41,19 @@ for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | sort COLUMNS_FILE=$DATA_DIR/$NAME.columns # If you want change or add .parquet file - rm data_parquet/*.json data_parquet/*.columns - [ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$BUILD_DIR"/contrib/arrow-cmake/parquet-reader --json "$DATA_DIR"/"$NAME" > "$JSON" - [ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/00900_parquet_create_table_columns.pl "$JSON" > "$COLUMNS_FILE" + [ -n "$PARQUET_READER" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$PARQUET_READER" --json "$DATA_DIR"/"$NAME" > "$JSON" + [ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/00900_parquet_create_table_columns.py "$JSON" > "$COLUMNS_FILE" # Debug only: - # [ -n "$BUILD_DIR" ] && $BUILD_DIR/contrib/arrow-cmake/parquet-reader $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump + # [ -n "$PARQUET_READER" ] && $PARQUET_READER $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump - #COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.pl $JSON` 2>&1 || continue + # COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.py $JSON` 2>&1 || continue COLUMNS=$(cat "$COLUMNS_FILE") || continue ${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load" ${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load ($COLUMNS) ENGINE = Memory" - # Some files is broken, exception is ok. + # Some files contain unsupported data structures, exception is ok. cat "$DATA_DIR"/"$NAME" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_load FORMAT Parquet" 2>&1 | sed 's/Exception/Ex---tion/' ${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load LIMIT 100" From dd81be96db70470a8c3db388126a0f8115a455f1 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 16:26:00 +0300 Subject: [PATCH 12/21] Update reference. Update test data. Add more tests. 
--- .../0_stateless/00900_parquet_load.reference | 348 +++++++++++------- .../alltypes_dictionary.parquet.columns | 2 +- .../alltypes_plain.parquet.columns | 2 +- .../alltypes_plain.snappy.parquet.columns | 2 +- .../0_stateless/data_parquet/binary.parquet | Bin 0 -> 478 bytes .../data_parquet/binary.parquet.columns | 1 + .../byte_array_decimal.parquet.columns | 2 +- .../datapage_v2.snappy.parquet.columns | 2 +- .../dict-page-offset-zero.parquet | Bin 0 -> 635 bytes .../dict-page-offset-zero.parquet.columns | 1 + .../data_parquet/fixed_length_decimal.parquet | Bin 0 -> 677 bytes .../fixed_length_decimal.parquet.columns | 1 + .../fixed_length_decimal_1.parquet.columns | 2 +- ...ixed_length_decimal_legacy.parquet.columns | 2 +- .../hadoop_lz4_compressed.parquet | Bin 0 -> 702 bytes .../hadoop_lz4_compressed.parquet.columns | 1 + .../int32_decimal.parquet.columns | 2 +- .../int64_decimal.parquet.columns | 2 +- .../data_parquet/list_columns.parquet | Bin 0 -> 2526 bytes .../data_parquet/list_columns.parquet.columns | 1 + .../nation.dict-malformed.parquet.columns | 2 +- .../nested_lists.snappy.parquet.columns | 2 +- .../nested_maps.snappy.parquet.columns | 2 +- .../non_hadoop_lz4_compressed.parquet | Bin 0 -> 1228 bytes .../non_hadoop_lz4_compressed.parquet.columns | 1 + .../nonnullable.impala.parquet.columns | 2 +- .../nullable.impala.parquet.columns | 2 +- .../data_parquet/nulls.snappy.parquet.columns | 2 +- .../repeated_no_annotation.parquet.columns | 2 +- .../data_parquet/single_nan.parquet | Bin 0 -> 660 bytes .../data_parquet/single_nan.parquet.columns | 1 + .../data_parquet/userdata1.parquet.columns | 2 +- .../data_parquet/userdata2.parquet.columns | 2 +- .../data_parquet/userdata3.parquet.columns | 2 +- .../data_parquet/userdata4.parquet.columns | 2 +- .../data_parquet/userdata5.parquet.columns | 2 +- .../v0.7.1.all-named-index.parquet.columns | 2 +- ...1.column-metadata-handling.parquet.columns | 2 +- .../data_parquet/v0.7.1.parquet.columns | 2 +- .../v0.7.1.some-named-index.parquet.columns | 2 +- 40 files changed, 251 insertions(+), 154 deletions(-) create mode 100644 tests/queries/0_stateless/data_parquet/binary.parquet create mode 100644 tests/queries/0_stateless/data_parquet/binary.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/dict-page-offset-zero.parquet create mode 100644 tests/queries/0_stateless/data_parquet/dict-page-offset-zero.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet create mode 100644 tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet create mode 100644 tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/list_columns.parquet create mode 100644 tests/queries/0_stateless/data_parquet/list_columns.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/non_hadoop_lz4_compressed.parquet create mode 100644 tests/queries/0_stateless/data_parquet/non_hadoop_lz4_compressed.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/single_nan.parquet create mode 100644 tests/queries/0_stateless/data_parquet/single_nan.parquet.columns diff --git a/tests/queries/0_stateless/00900_parquet_load.reference b/tests/queries/0_stateless/00900_parquet_load.reference index 6cd2b1cf462..6c5f42cbd63 100644 --- a/tests/queries/0_stateless/00900_parquet_load.reference +++ 
b/tests/queries/0_stateless/00900_parquet_load.reference @@ -13,134 +13,220 @@ === Try load data from alltypes_plain.snappy.parquet 6 1 0 0 0 0 0 0 04/01/09 0 1238544000 7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060 +=== Try load data from binary.parquet +\0 + + + + + + + +\b +\t +\n + === Try load data from byte_array_decimal.parquet -1.0 -2.0 -3.0 -4.0 -5.0 -6.0 -7.0 -8.0 -9.0 -10.0 -11.0 -12.0 -13.0 -14.0 -15.0 -16.0 -17.0 -18.0 -19.0 -20.0 -21.0 -22.0 -23.0 -24.0 +1.00 +2.00 +3.00 +4.00 +5.00 +6.00 +7.00 +8.00 +9.00 +10.00 +11.00 +12.00 +13.00 +14.00 +15.00 +16.00 +17.00 +18.00 +19.00 +20.00 +21.00 +22.00 +23.00 +24.00 === Try load data from datapage_v2.snappy.parquet Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet implemented: Unsupported encoding.: data for INSERT was parsed from stdin +=== Try load data from dict-page-offset-zero.parquet +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 +1552 === Try load data from fixed_length_decimal_1.parquet -1.0 -2.0 -3.0 -4.0 -5.0 -6.0 -7.0 -8.0 -9.0 -10.0 -11.0 -12.0 -13.0 -14.0 -15.0 -16.0 -17.0 -18.0 -19.0 -20.0 -21.0 -22.0 -23.0 -24.0 +1.00 +2.00 +3.00 +4.00 +5.00 +6.00 +7.00 +8.00 +9.00 +10.00 +11.00 +12.00 +13.00 +14.00 +15.00 +16.00 +17.00 +18.00 +19.00 +20.00 +21.00 +22.00 +23.00 +24.00 === Try load data from fixed_length_decimal_legacy.parquet -1.0 -2.0 -3.0 -4.0 -5.0 -6.0 -7.0 -8.0 -9.0 -10.0 -11.0 -12.0 -13.0 -14.0 -15.0 -16.0 -17.0 -18.0 -19.0 -20.0 -21.0 -22.0 -23.0 -24.0 +1.00 +2.00 +3.00 +4.00 +5.00 +6.00 +7.00 +8.00 +9.00 +10.00 +11.00 +12.00 +13.00 +14.00 +15.00 +16.00 +17.00 +18.00 +19.00 +20.00 +21.00 +22.00 +23.00 +24.00 +=== Try load data from fixed_length_decimal.parquet +1.00 +2.00 +3.00 +4.00 +5.00 +6.00 +7.00 +8.00 +9.00 +10.00 +11.00 +12.00 +13.00 +14.00 +15.00 +16.00 +17.00 +18.00 +19.00 +20.00 +21.00 +22.00 +23.00 +24.00 +=== Try load data from hadoop_lz4_compressed.parquet +1593604800 abc 42 +1593604800 def 7.7 +1593604801 abc 42.125 +1593604801 def 7.7 === Try load data from int32_decimal.parquet -1.0 -2.0 -3.0 -4.0 -5.0 -6.0 -7.0 -8.0 -9.0 -10.0 -11.0 -12.0 -13.0 -14.0 -15.0 -16.0 -17.0 -18.0 -19.0 -20.0 -21.0 -22.0 -23.0 -24.0 +1.00 +2.00 +3.00 +4.00 +5.00 +6.00 +7.00 +8.00 +9.00 +10.00 +11.00 +12.00 +13.00 +14.00 +15.00 +16.00 +17.00 +18.00 +19.00 +20.00 +21.00 +22.00 +23.00 +24.00 === Try load data from int64_decimal.parquet -1.0 -2.0 -3.0 -4.0 -5.0 -6.0 -7.0 -8.0 -9.0 -10.0 -11.0 -12.0 -13.0 -14.0 -15.0 -16.0 -17.0 -18.0 -19.0 -20.0 -21.0 -22.0 -23.0 -24.0 +1.00 +2.00 +3.00 +4.00 +5.00 +6.00 +7.00 +8.00 +9.00 +10.00 +11.00 +12.00 +13.00 +14.00 +15.00 +16.00 +17.00 +18.00 +19.00 +20.00 +21.00 +22.00 +23.00 +24.00 +=== Try load data from list_columns.parquet +Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin + === Try load data from nation.dict-malformed.parquet 0 ALGERIA 0 haggle. carefully final deposits detect slyly agai 1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon @@ -168,23 +254,27 @@ Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet impl 23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull 24 UNITED STATES 1 y final packages. slow foxes cajole quickly. 
quickly silent platelets breach ironic accounts. unusual pinto be === Try load data from nested_lists.snappy.parquet -Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin +Code: 70. DB::Ex---tion: The type "list" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin === Try load data from nested_maps.snappy.parquet -Code: 33. DB::Ex---tion: Error while reading Parquet data: NotImplemented: Reading lists of structs from Parquet files not yet supported: key_value: list not null> not null>> not null> not null: data for INSERT was parsed from stdin +Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin +=== Try load data from non_hadoop_lz4_compressed.parquet +1593604800 abc 42 +1593604800 def 7.7 +1593604801 abc 42.125 +1593604801 def 7.7 === Try load data from nonnullable.impala.parquet -Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin - +../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) === Try load data from nullable.impala.parquet -Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin - +../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) === Try load data from nulls.snappy.parquet -Code: 8. DB::Ex---tion: Column "b_c_int" is not presented in input data: data for INSERT was parsed from stdin +Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin === Try load data from repeated_no_annotation.parquet -Code: 8. 
DB::Ex---tion: Column "number" is not presented in input data: data for INSERT was parsed from stdin - +../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) +=== Try load data from single_nan.parquet +\N === Try load data from userdata1.parquet 1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02 1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV diff --git a/tests/queries/0_stateless/data_parquet/alltypes_dictionary.parquet.columns b/tests/queries/0_stateless/data_parquet/alltypes_dictionary.parquet.columns index e13d779fda2..cbc891b2ca7 100644 --- a/tests/queries/0_stateless/data_parquet/alltypes_dictionary.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/alltypes_dictionary.parquet.columns @@ -1 +1 @@ -`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), `int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) \ No newline at end of file +`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), `int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) diff --git a/tests/queries/0_stateless/data_parquet/alltypes_plain.parquet.columns b/tests/queries/0_stateless/data_parquet/alltypes_plain.parquet.columns index e13d779fda2..cbc891b2ca7 100644 --- a/tests/queries/0_stateless/data_parquet/alltypes_plain.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/alltypes_plain.parquet.columns @@ -1 +1 @@ -`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), `int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) \ No newline at end of file +`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), `int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) diff --git a/tests/queries/0_stateless/data_parquet/alltypes_plain.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/alltypes_plain.snappy.parquet.columns index e13d779fda2..cbc891b2ca7 100644 --- a/tests/queries/0_stateless/data_parquet/alltypes_plain.snappy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/alltypes_plain.snappy.parquet.columns @@ -1 +1 @@ -`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), `int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) \ No newline at end of file +`id` Nullable(Int32), `bool_col` Nullable(UInt8), `tinyint_col` Nullable(Int32), `smallint_col` Nullable(Int32), 
`int_col` Nullable(Int32), `bigint_col` Nullable(Int64), `float_col` Nullable(Float32), `double_col` Nullable(Float64), `date_string_col` Nullable(String), `string_col` Nullable(String), `timestamp_col` Nullable(Int64) diff --git a/tests/queries/0_stateless/data_parquet/binary.parquet b/tests/queries/0_stateless/data_parquet/binary.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fc8c04669d126830f81553e8ac446860dd67c78e GIT binary patch literal 478 zcmY+BT}#6-6o%7IXV>jUG$VmtgrQ@?EbYD>yV;~zbW|qPqE^+gaEZ3_2BKL4L%jT>!K@mC zKR46@e+UX7q>=>D0U?_}zHWJHZ=^_tz2M1QMw?Vf^1KdKyw57xs}*w@@p!>LO~&?d zc)y}f+f~SApShspAmnkMjfW#|{4(*Ux0Czf7~<*s1}aTwzV|YkO}$6+%S7b(O-3n~ z_DU>xY^$Qg$V`OWB_b#Pea+(_Nyw?{LKuA5*FwN{mRUpOZh!Z3CE1S4Bob@p+OA_e ztQKw~u4z*2o5aTsTAKh&9=x_y*J)&#)Y!iH0tj=$W~8UgKngLCu_YOym>Tg4J`Y&@^dTd z0Uk_`r4EyIwUv#TpdA~$G~9t)o*ck^IL-A6BicB)XLN>`V7DbG5#`pz|vTRjm7g5fy$k}FZS|F9ESQ%$rU3;+Vbcg~z4_$c_zDIE*GaW3eB4}xEu`9r~1&c$`XJHb!R!mi*8=aMW) z1wT0DZNVqOAI{|tlvHDXEg9hW>OAo;Nra&UU=fy}q%7WGnvsnan1>E1S*oULWm$to z)s+iCorWQclz#JUl;xed)Bb<-s>Xg0VC4ZAFR*~BDb;>_Vd4R1JDZax%;cJytefkM zq-NZ|H6xPZ0H;{-hA6!s5Y$3R8fSir3>0<=4HDEk9ikqN#-T~aEHcduh~JGctvIgX zRM~VdqZE6TWwH{)K(pegH5`r3MnE~3ZoL`B`VEPbZojA7X2W#!ikEe1p!+q~aq5jm m&1n+1X<>JZc=cwZzU?*Lu;y=tp>2nD&98zJcE{T6g8l(ebZuw= literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet.columns b/tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet.columns new file mode 100644 index 00000000000..469105337a6 --- /dev/null +++ b/tests/queries/0_stateless/data_parquet/fixed_length_decimal.parquet.columns @@ -0,0 +1 @@ +`value` Nullable(Decimal(25, 2)) diff --git a/tests/queries/0_stateless/data_parquet/fixed_length_decimal_1.parquet.columns b/tests/queries/0_stateless/data_parquet/fixed_length_decimal_1.parquet.columns index 668abaf93c3..469105337a6 100644 --- a/tests/queries/0_stateless/data_parquet/fixed_length_decimal_1.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/fixed_length_decimal_1.parquet.columns @@ -1 +1 @@ -`value` Nullable(Decimal128(1)) \ No newline at end of file +`value` Nullable(Decimal(25, 2)) diff --git a/tests/queries/0_stateless/data_parquet/fixed_length_decimal_legacy.parquet.columns b/tests/queries/0_stateless/data_parquet/fixed_length_decimal_legacy.parquet.columns index 668abaf93c3..5e61877db58 100644 --- a/tests/queries/0_stateless/data_parquet/fixed_length_decimal_legacy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/fixed_length_decimal_legacy.parquet.columns @@ -1 +1 @@ -`value` Nullable(Decimal128(1)) \ No newline at end of file +`value` Nullable(Decimal(13, 2)) diff --git a/tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet b/tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b5fadcd49c332450a97efd144b3cd7bcd0d6f27f GIT binary patch literal 702 zcmb7CJ4?hs5T4D3V~K|^oFxlE(p)$}kF2NBK#EuhB3Fn%K)76TAox5Vh(Ex}-yr@D zEA6$nw6wIcvT!zM5Fa28+05=Y-+VhW$;Qg2MjL%x=#aC)*f_1p7j}F8wq=fYA07c~ z=+2@bqCnxfqKdep;!0@#f*KV7PRkA%uT#ELDKPeA!riQATwe6QUlu`=6YO%L z5QD6eb4V`4q(j9G%XcX#CxAh^nm*h!xSmc8AI4WxQk}wJ*g~yrp^CWT;ojJEYFhY- zmIyfHMy@S-^D&F(fR^g4(O3ZO2Dnx)1gw`E4d^s1m!E14QYcR)f$-2)4Ph(8Lq`gE z!{ieSMDmY8s#)`u&;shjIDp`pd04N6bt!x@K1}u(f0nHuUk57X}_7BZ13cLcR9}c t%l$ZR&u3B5kD_)v52G-Oi^Z@PXG`%?m=#&BXH?<=3?DdS02}n@z5@p&dN}|9 literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet.columns 
b/tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet.columns new file mode 100644 index 00000000000..5a0c330c88f --- /dev/null +++ b/tests/queries/0_stateless/data_parquet/hadoop_lz4_compressed.parquet.columns @@ -0,0 +1 @@ +`c0` Nullable(Int64), `c1` Nullable(String), `v11` Nullable(Float64) diff --git a/tests/queries/0_stateless/data_parquet/int32_decimal.parquet.columns b/tests/queries/0_stateless/data_parquet/int32_decimal.parquet.columns index 668abaf93c3..cb2a97de8c4 100644 --- a/tests/queries/0_stateless/data_parquet/int32_decimal.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/int32_decimal.parquet.columns @@ -1 +1 @@ -`value` Nullable(Decimal128(1)) \ No newline at end of file +`value` Nullable(Decimal(4, 2)) diff --git a/tests/queries/0_stateless/data_parquet/int64_decimal.parquet.columns b/tests/queries/0_stateless/data_parquet/int64_decimal.parquet.columns index 668abaf93c3..3624a571970 100644 --- a/tests/queries/0_stateless/data_parquet/int64_decimal.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/int64_decimal.parquet.columns @@ -1 +1 @@ -`value` Nullable(Decimal128(1)) \ No newline at end of file +`value` Nullable(Decimal(10, 2)) diff --git a/tests/queries/0_stateless/data_parquet/list_columns.parquet b/tests/queries/0_stateless/data_parquet/list_columns.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ecd7597e2fbe154c86724136ebc77a2b2f5be496 GIT binary patch literal 2526 zcmcIm-EQJW6gFV7Nfou*N=6`Y5m{Zd7unSOrOB$RW*`Ba0!f^#ZP;B!{sT;G<6wg! zX{i@Y9sY$EcHh$_d2)saRGtk zF7temS2m3UE21hMQRv4Np$J2L8z{Htk z^Bag)rqiSxOq^oqC-F3m%YvQX1%1$WW8wRRYGwf^mCR%_TmNsx|8r4NKPFO1DzlYN zx{9MKUiugMhBY1Cm|bWiYhpVd`+W1&4SNYXn6YEU=^O0UChI7+0SaegSs;17GWJ2_ z$b96&v`D4~p5|}gfDqzc5O5=cT}A1chNWMu>Oxu<{@lgk*ok9mBRym=vEBL7Fj9wx z=3{2I;VZi0E8u>^8Vye`8&aEp zXNK!`y9ETJY}w_3en0ez>7mF`Q9dd**^#lK_@gl?lx3=8Wjx1qa_7o;JPI(8sWJA< zk%P6|cD}rwXMr>uaab`H%G*WumP&6RK{%G4hfS7#yZ#k^O6UL~bP7NGe1}j8Rr;ug zP#;zML{~wM7V5)@+R5iF`81aAydY2g>h2pt6AU-_E1tx2~omgGWv zsy0Mlbz0`JSy7Go(O8knm;7+l=bfX0YU>tn27TTx4^-(T_L=C?%xhMWsTGBR=A42K z1(7_}s{9Ur(DL}|(Y0O|1B}y>^E$S73VWU|jNO%&F((}jyWMym&z{SUv|%U92+?114G>747oyZd3(=PY;mYpiQVtY z!T>P1&KdA+cxG2B57mO;s`{H7h- zwK@@wDv4Z25(awxaJfIdv!D-q`7XXAa^2zJ!yMoD2|92VI&%ANY)_HfoKzGiiqxLM z8PcQkkyqtP`R<;mg`=yk)aF8a-qPzez!C<_eQreOqIhJ1eyW+_UO){@&92RjFxSA4 zYIDurD-s;1YiUl~RSPAuPpxWc);=rEc!$IL{*JFL&b5LyRWW9j51Ko!~VgoNBRn(s}peAIfr2vREITaT6h-yNd$yi z#7-6|vS6uU{3C4DWx+nK?U8z7xrp)b^3&NRmP0(UfV6G{D}f&f1P_LnxWD1P027Hz zjKmEcR!NWwSx@{!eIH*qa4%?K(nJ|XB0=gF*OfS;CHGH#<=le)#S1D8*WCDOV))NA l*WHZny3PDHT!PQe+UVr0aeSWHf)73Z*1uk(s1x{S_YZBXv?~Ar literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/list_columns.parquet.columns b/tests/queries/0_stateless/data_parquet/list_columns.parquet.columns new file mode 100644 index 00000000000..86745c2f074 --- /dev/null +++ b/tests/queries/0_stateless/data_parquet/list_columns.parquet.columns @@ -0,0 +1 @@ +`int64_list` Nullable(Int64), `utf8_list` Nullable(String) diff --git a/tests/queries/0_stateless/data_parquet/nation.dict-malformed.parquet.columns b/tests/queries/0_stateless/data_parquet/nation.dict-malformed.parquet.columns index dbfe40811d7..34513a1a8e2 100644 --- a/tests/queries/0_stateless/data_parquet/nation.dict-malformed.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/nation.dict-malformed.parquet.columns @@ -1 +1 @@ -`nation_key` Nullable(Int32), `name` Nullable(String), `region_key` Nullable(Int32), `comment_col` Nullable(String) \ No newline at end of file +`nation_key` 
Nullable(Int32), `name` Nullable(String), `region_key` Nullable(Int32), `comment_col` Nullable(String) diff --git a/tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns index e939719c71c..6d55d46dd5b 100644 --- a/tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/nested_lists.snappy.parquet.columns @@ -1 +1 @@ -`element` Nullable(String), `b` Nullable(Int32) \ No newline at end of file +`a` Nullable(String), `b` Nullable(Int32) diff --git a/tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns b/tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns index c0ac26b2478..d5e9599431b 100644 --- a/tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/nested_maps.snappy.parquet.columns @@ -1 +1 @@ -`key` Nullable(String), `key1` Nullable(Int32), `value` Nullable(UInt8), `b` Nullable(Int32), `c` Nullable(Float64) \ No newline at end of file +`a` Tuple(Nullable(String), Nullable(Int32), Nullable(UInt8)), `b` Nullable(Int32), `c` Nullable(Float64) diff --git a/tests/queries/0_stateless/data_parquet/non_hadoop_lz4_compressed.parquet b/tests/queries/0_stateless/data_parquet/non_hadoop_lz4_compressed.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cfbdc7ef2db3aa70119e5941701a0fb1647e2de3 GIT binary patch literal 1228 zcmcIk!EVz)5S_JM%ZhSN|T4;eU z;D8Y4p85ej@da?}q2-7WC&U*Z&K#Jr;}B>#Ak?R5XZOwQ+qbiFI;!iVfh9cKq+rYf zd=u|p{I~;z_oIk_vsfsjMHMERu5D!*+sbMs%uQiQ$J_;8Nr)q%d*jiM< zoV4VDkROEC@T`APb;|4&H6UjkfP;(0>JLqBo&Qpi`rJJ7w?y8GMCsV*v5Zc>xDk*7gGP!Em{1?&|AItbuYJs>j$l#|WO zGPZSe$S4Nf%;PLEducN@eI2~-ri;0eSAE-0&3+d0lkmRIc)K&y(Bm3^QJ!)Y&1qJ_ zHn*aCgNJ=Isl(?qel@^+GVuX(G~ts$T8`^1p5*)f@5`KUx8z$xT6BMk1pvpj^Qzlz z-@cS~&)(P0GcLx8u)+i3x>7qTjMWvTVpU60DkQ4K$(mAEIflO?tvD4{m~ZotNYyr2 zniipYo{+bAq9r7l#C!*qob-PN#ERH4|)iN60c#@0mx8U__dyc-M_gY&*8|9~*OFVq`m(kuc%2 z)cQKu(m?q9a|u(kNTl?#{kWyCpg}|>6B|yuZ7NvAK@eFOLY68f6{LeQjuQyoGUR1g zJV8?VElYF0v#qjK=Ri5Z^G!uar`xQYJCHYSwUcGQnX|jkb<}<~heM`B3 z_CRd(-G}*A?}8S4Fm(!AZdzN^Kz`*vEbB04YCJ5rQDfOF?7`?&RkI~$DwZW*7_20d zxyDO!CT0Ag?cTXegIH*F(07OZ0WCn?jjt8YMtzqS$bzDL%ed-!Nz&eRwB0^bd$Zfg Tbas6=w;cyw*Eu||DZIyTtqren literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/single_nan.parquet.columns b/tests/queries/0_stateless/data_parquet/single_nan.parquet.columns new file mode 100644 index 00000000000..f2bb48365be --- /dev/null +++ b/tests/queries/0_stateless/data_parquet/single_nan.parquet.columns @@ -0,0 +1 @@ +`mycol` Nullable(Float64) diff --git a/tests/queries/0_stateless/data_parquet/userdata1.parquet.columns b/tests/queries/0_stateless/data_parquet/userdata1.parquet.columns index a1fa01e30ae..93e617f0467 100644 --- a/tests/queries/0_stateless/data_parquet/userdata1.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/userdata1.parquet.columns @@ -1 +1 @@ -`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) \ No newline at end of file +`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` 
Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) diff --git a/tests/queries/0_stateless/data_parquet/userdata2.parquet.columns b/tests/queries/0_stateless/data_parquet/userdata2.parquet.columns index a1fa01e30ae..93e617f0467 100644 --- a/tests/queries/0_stateless/data_parquet/userdata2.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/userdata2.parquet.columns @@ -1 +1 @@ -`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) \ No newline at end of file +`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) diff --git a/tests/queries/0_stateless/data_parquet/userdata3.parquet.columns b/tests/queries/0_stateless/data_parquet/userdata3.parquet.columns index a1fa01e30ae..93e617f0467 100644 --- a/tests/queries/0_stateless/data_parquet/userdata3.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/userdata3.parquet.columns @@ -1 +1 @@ -`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) \ No newline at end of file +`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) diff --git a/tests/queries/0_stateless/data_parquet/userdata4.parquet.columns b/tests/queries/0_stateless/data_parquet/userdata4.parquet.columns index a1fa01e30ae..93e617f0467 100644 --- a/tests/queries/0_stateless/data_parquet/userdata4.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/userdata4.parquet.columns @@ -1 +1 @@ -`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) \ No newline at end of file +`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) diff --git 
a/tests/queries/0_stateless/data_parquet/userdata5.parquet.columns b/tests/queries/0_stateless/data_parquet/userdata5.parquet.columns index a1fa01e30ae..93e617f0467 100644 --- a/tests/queries/0_stateless/data_parquet/userdata5.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/userdata5.parquet.columns @@ -1 +1 @@ -`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) \ No newline at end of file +`registration_dttm` Nullable(Int64), `id` Nullable(Int32), `first_name` Nullable(String), `last_name` Nullable(String), `email` Nullable(String), `gender` Nullable(String), `ip_address` Nullable(String), `cc` Nullable(String), `country` Nullable(String), `birthdate` Nullable(String), `salary` Nullable(Float64), `title` Nullable(String), `comments` Nullable(String) diff --git a/tests/queries/0_stateless/data_parquet/v0.7.1.all-named-index.parquet.columns b/tests/queries/0_stateless/data_parquet/v0.7.1.all-named-index.parquet.columns index bcb204a577a..3f152e35001 100644 --- a/tests/queries/0_stateless/data_parquet/v0.7.1.all-named-index.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/v0.7.1.all-named-index.parquet.columns @@ -1 +1 @@ -`carat` Nullable(Float64), `depth` Nullable(Float64), `table` Nullable(Float64), `price` Nullable(Int64), `x` Nullable(Float64), `y` Nullable(Float64), `z` Nullable(Float64), `cut` Nullable(String), `color` Nullable(String), `clarity` Nullable(String) \ No newline at end of file +`carat` Nullable(Float64), `depth` Nullable(Float64), `table` Nullable(Float64), `price` Nullable(Int64), `x` Nullable(Float64), `y` Nullable(Float64), `z` Nullable(Float64), `cut` Nullable(String), `color` Nullable(String), `clarity` Nullable(String) diff --git a/tests/queries/0_stateless/data_parquet/v0.7.1.column-metadata-handling.parquet.columns b/tests/queries/0_stateless/data_parquet/v0.7.1.column-metadata-handling.parquet.columns index b79ebb7e612..3d08da2522c 100644 --- a/tests/queries/0_stateless/data_parquet/v0.7.1.column-metadata-handling.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/v0.7.1.column-metadata-handling.parquet.columns @@ -1 +1 @@ -`a` Nullable(Int64), `b` Nullable(Float64), `c` Nullable(DateTime), `index` Nullable(String), `__index_level_1__` Nullable(DateTime) \ No newline at end of file +`a` Nullable(Int64), `b` Nullable(Float64), `c` Nullable(DateTime), `index` Nullable(String), `__index_level_1__` Nullable(DateTime) diff --git a/tests/queries/0_stateless/data_parquet/v0.7.1.parquet.columns b/tests/queries/0_stateless/data_parquet/v0.7.1.parquet.columns index 70c607038ee..57a97f5fce9 100644 --- a/tests/queries/0_stateless/data_parquet/v0.7.1.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/v0.7.1.parquet.columns @@ -1 +1 @@ -`carat` Nullable(Float64), `cut` Nullable(String), `color` Nullable(String), `clarity` Nullable(String), `depth` Nullable(Float64), `table` Nullable(Float64), `price` Nullable(Int64), `x` Nullable(Float64), `y` Nullable(Float64), `z` Nullable(Float64), `__index_level_0__` Nullable(Int64) \ No newline at end of file +`carat` Nullable(Float64), `cut` Nullable(String), `color` Nullable(String), `clarity` Nullable(String), `depth` Nullable(Float64), `table` Nullable(Float64), `price` 
Nullable(Int64), `x` Nullable(Float64), `y` Nullable(Float64), `z` Nullable(Float64), `__index_level_0__` Nullable(Int64) diff --git a/tests/queries/0_stateless/data_parquet/v0.7.1.some-named-index.parquet.columns b/tests/queries/0_stateless/data_parquet/v0.7.1.some-named-index.parquet.columns index cde1175cb0d..50b4cb1dfbc 100644 --- a/tests/queries/0_stateless/data_parquet/v0.7.1.some-named-index.parquet.columns +++ b/tests/queries/0_stateless/data_parquet/v0.7.1.some-named-index.parquet.columns @@ -1 +1 @@ -`carat` Nullable(Float64), `depth` Nullable(Float64), `table` Nullable(Float64), `price` Nullable(Int64), `x` Nullable(Float64), `y` Nullable(Float64), `z` Nullable(Float64), `cut` Nullable(String), `__index_level_1__` Nullable(String), `clarity` Nullable(String) \ No newline at end of file +`carat` Nullable(Float64), `depth` Nullable(Float64), `table` Nullable(Float64), `price` Nullable(Int64), `x` Nullable(Float64), `y` Nullable(Float64), `z` Nullable(Float64), `cut` Nullable(String), `__index_level_1__` Nullable(String), `clarity` Nullable(String) From 6cf4d58ce1bdd00c10e78bc37a293e63b8234767 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 17:27:25 +0300 Subject: [PATCH 13/21] Fix parquet reader in submodule --- .gitmodules | 2 +- contrib/arrow | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 591139856bd..2c1b15ffecb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -53,7 +53,7 @@ [submodule "contrib/arrow"] path = contrib/arrow url = https://github.com/ClickHouse-Extras/arrow - branch = clickhouse-fix-ipv6 + branch = clickhouse-arrow-2.0.0 [submodule "contrib/thrift"] path = contrib/thrift url = https://github.com/apache/thrift.git diff --git a/contrib/arrow b/contrib/arrow index b693865b56f..b5af74834d6 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit b693865b56fba31746278bb9c03bf0c9149fe19a +Subproject commit b5af74834d6ee623767143b06e8a83f7e9e79e07 From bd1daf93fad14792cf4d610efb150765699fc98a Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 17:32:45 +0300 Subject: [PATCH 14/21] Set correct sha for submodule --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index b5af74834d6..744bdfe188f 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit b5af74834d6ee623767143b06e8a83f7e9e79e07 +Subproject commit 744bdfe188f018e5e05f5deebd4e9ee0a7706cf4 From 65e4fa555b2c72627304c5a434c1af67682146a9 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 19:49:36 +0300 Subject: [PATCH 15/21] Remove build branch with internal parquet native cmake --- cmake/find/parquet.cmake | 8 ------ contrib/CMakeLists.txt | 54 +++++++++------------------------------- src/CMakeLists.txt | 2 +- 3 files changed, 13 insertions(+), 51 deletions(-) diff --git a/cmake/find/parquet.cmake b/cmake/find/parquet.cmake index 6d05fa17aec..eb1b529fbfe 100644 --- a/cmake/find/parquet.cmake +++ b/cmake/find/parquet.cmake @@ -141,11 +141,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O else() set(USE_INTERNAL_PARQUET_LIBRARY 1) - if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE) - set(ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src") - set(PARQUET_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src" ${ClickHouse_BINARY_DIR}/contrib/arrow/cpp/src) - endif() - if(MAKE_STATIC_LIBRARIES) set(FLATBUFFERS_LIBRARY flatbuffers) set(ARROW_LIBRARY arrow_static) @@ -155,9 +150,6 @@ if(NOT 
EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O set(FLATBUFFERS_LIBRARY flatbuffers_shared) set(ARROW_LIBRARY arrow_shared) set(PARQUET_LIBRARY parquet_shared) - if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE) - list(APPEND PARQUET_LIBRARY boost::regex) - endif() set(THRIFT_LIBRARY thrift) endif() diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 7d6b9c0e374..1d763239d27 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -163,51 +163,22 @@ if(USE_INTERNAL_SNAPPY_LIBRARY) endif() if (USE_INTERNAL_PARQUET_LIBRARY) -if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE) # We dont use arrow's cmakefiles because they uses too many depends and download some libs in compile time - # But this mode can be used for updating auto-generated parquet files: - # cmake -DUSE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE=1 -DUSE_STATIC_LIBRARIES=0 - # copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> /contrib/arrow-cmake/cpp/src/parquet/ + # But you can update auto-generated parquet files manually: + # cd {BUILD_DIR}/contrib/arrow/cpp/src/parquet && mkdir -p build && cd build + # cmake .. -DARROW_COMPUTE=ON -DARROW_PARQUET=ON -DARROW_SIMD_LEVEL=NONE -DARROW_VERBOSE_THIRDPARTY_BUILD=ON + # -DARROW_BUILD_SHARED=1 -DARROW_BUILD_UTILITIES=OFF -DARROW_BUILD_INTEGRATION=OFF + # -DBoost_FOUND=1 -DARROW_TEST_LINKAGE="shared" + # make -j8 + # copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> {BUILD_DIR}/contrib/arrow-cmake/cpp/src/parquet/ # Also useful parquet reader: - # cd contrib/arrow/cpp/build && mkdir -p build && cmake .. -DPARQUET_BUILD_EXECUTABLES=1 && make -j8 - # contrib/arrow/cpp/build/debug/parquet-reader some_file.parquet + # cd {BUILD_DIR}/contrib/arrow/cpp && mkdir -p build && cd build + # cmake .. -DARROW_PARQUET=1 -DARROW_WITH_SNAPPY=1 -DPARQUET_BUILD_EXECUTABLES=1 + # make -j8 + # cd contrib/arrow/cpp && mkdir -p build && cmake .. -DPARQUET_BUILD_EXECUTABLES=1 && make -j8 + # {BUILD_DIR}/contrib/arrow/cpp/build/release/parquet-reader some_file.parquet - set (ARROW_COMPUTE ON CACHE INTERNAL "") - set (ARROW_PARQUET ON CACHE INTERNAL "") - set (ARROW_VERBOSE_THIRDPARTY_BUILD ON CACHE INTERNAL "") - set (ARROW_BUILD_SHARED 1 CACHE INTERNAL "") - set (ARROW_BUILD_UTILITIES OFF CACHE INTERNAL "") - set (ARROW_BUILD_INTEGRATION OFF CACHE INTERNAL "") - set (ARROW_BOOST_HEADER_ONLY ON CACHE INTERNAL "") - set (Boost_FOUND 1 CACHE INTERNAL "") - if (MAKE_STATIC_LIBRARIES) - set (PARQUET_ARROW_LINKAGE "static" CACHE INTERNAL "") - set (ARROW_TEST_LINKAGE "static" CACHE INTERNAL "") - set (ARROW_BUILD_STATIC ${MAKE_STATIC_LIBRARIES} CACHE INTERNAL "") - else () - set (PARQUET_ARROW_LINKAGE "shared" CACHE INTERNAL "") - set (ARROW_TEST_LINKAGE "shared" CACHE INTERNAL "") - endif () - - if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO") - set (_save_build_type ${CMAKE_BUILD_TYPE}) - set (CMAKE_BUILD_TYPE Release) - string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC) - endif () - - # Because Arrow uses CMAKE_SOURCE_DIR as a project path - # Hopefully will be fixed in https://github.com/apache/arrow/pull/2676 - set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/cmake_modules") - add_subdirectory (arrow/cpp) - - if (_save_build_type) - set (CMAKE_BUILD_TYPE ${_save_build_type}) - unset (_save_build_type) - string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC) - endif () - -else() add_subdirectory(arrow-cmake) # The library is large - avoid bloat. 
@@ -215,7 +186,6 @@ else() target_compile_options (${THRIFT_LIBRARY} PRIVATE -g0) target_compile_options (${PARQUET_LIBRARY} PRIVATE -g0) endif() -endif() if (USE_INTERNAL_AVRO_LIBRARY) add_subdirectory(avro-cmake) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 085269847e4..21ce5145699 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -340,7 +340,7 @@ endif () if (USE_PARQUET) dbms_target_link_libraries(PRIVATE ${PARQUET_LIBRARY}) - if (NOT USE_INTERNAL_PARQUET_LIBRARY OR USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE) + if (NOT USE_INTERNAL_PARQUET_LIBRARY) dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${PARQUET_INCLUDE_DIR} ${ARROW_INCLUDE_DIR}) if (USE_STATIC_LIBRARIES) dbms_target_link_libraries(PRIVATE ${ARROW_LIBRARY}) From 42c9dba86ec907948b36e94ec525d9d32dc27169 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sat, 7 Nov 2020 21:01:30 +0300 Subject: [PATCH 16/21] Remove irrelevant comment --- tests/queries/0_stateless/00900_parquet_create_table_columns.py | 1 - tests/queries/0_stateless/00900_parquet_load.sh | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/queries/0_stateless/00900_parquet_create_table_columns.py b/tests/queries/0_stateless/00900_parquet_create_table_columns.py index 44dc6d6df64..1a41da8c8b4 100755 --- a/tests/queries/0_stateless/00900_parquet_create_table_columns.py +++ b/tests/queries/0_stateless/00900_parquet_create_table_columns.py @@ -4,7 +4,6 @@ import json import sys TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE = { - "DECIMAL": "Decimal128(1)", "TIMESTAMP_MICROS": "DateTime", "TIMESTAMP_MILLIS": "DateTime", "UTF8": "String", diff --git a/tests/queries/0_stateless/00900_parquet_load.sh b/tests/queries/0_stateless/00900_parquet_load.sh index 59e7a1588b2..aa9a0a9d659 100755 --- a/tests/queries/0_stateless/00900_parquet_load.sh +++ b/tests/queries/0_stateless/00900_parquet_load.sh @@ -5,8 +5,6 @@ # TODO: Add more files. # -# To regenerate data install perl JSON::XS module: sudo apt install libjson-xs-perl - # Also 5 sample files from # wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet # ... From 1d7614746add05297aa44968defd1ff19bcceb72 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sun, 8 Nov 2020 04:32:27 +0300 Subject: [PATCH 17/21] Fix issue with locale. Move helper script out of tests folder --- .../0_stateless/00900_parquet_load.reference | 50 +++++++++---------- .../queries/0_stateless/00900_parquet_load.sh | 4 +- .../00900_parquet_create_table_columns.py | 0 3 files changed, 27 insertions(+), 27 deletions(-) rename tests/queries/0_stateless/{ => helpers}/00900_parquet_create_table_columns.py (100%) diff --git a/tests/queries/0_stateless/00900_parquet_load.reference b/tests/queries/0_stateless/00900_parquet_load.reference index 6c5f42cbd63..3311e03d7f8 100644 --- a/tests/queries/0_stateless/00900_parquet_load.reference +++ b/tests/queries/0_stateless/00900_parquet_load.reference @@ -94,6 +94,31 @@ Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet impl 1552 1552 1552 +=== Try load data from fixed_length_decimal.parquet +1.00 +2.00 +3.00 +4.00 +5.00 +6.00 +7.00 +8.00 +9.00 +10.00 +11.00 +12.00 +13.00 +14.00 +15.00 +16.00 +17.00 +18.00 +19.00 +20.00 +21.00 +22.00 +23.00 +24.00 === Try load data from fixed_length_decimal_1.parquet 1.00 2.00 @@ -144,31 +169,6 @@ Code: 33. 
DB::Ex---tion: Error while reading Parquet data: IOError: Not yet impl 22.00 23.00 24.00 -=== Try load data from fixed_length_decimal.parquet -1.00 -2.00 -3.00 -4.00 -5.00 -6.00 -7.00 -8.00 -9.00 -10.00 -11.00 -12.00 -13.00 -14.00 -15.00 -16.00 -17.00 -18.00 -19.00 -20.00 -21.00 -22.00 -23.00 -24.00 === Try load data from hadoop_lz4_compressed.parquet 1593604800 abc 42 1593604800 def 7.7 diff --git a/tests/queries/0_stateless/00900_parquet_load.sh b/tests/queries/0_stateless/00900_parquet_load.sh index aa9a0a9d659..290a581aead 100755 --- a/tests/queries/0_stateless/00900_parquet_load.sh +++ b/tests/queries/0_stateless/00900_parquet_load.sh @@ -32,7 +32,7 @@ DATA_DIR=$CUR_DIR/data_parquet # ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) # ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue -for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | sort); do +for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_ALL=C sort); do echo === Try load data from "$NAME" JSON=$DATA_DIR/$NAME.json @@ -40,7 +40,7 @@ for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | sort # If you want change or add .parquet file - rm data_parquet/*.json data_parquet/*.columns [ -n "$PARQUET_READER" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$PARQUET_READER" --json "$DATA_DIR"/"$NAME" > "$JSON" - [ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/00900_parquet_create_table_columns.py "$JSON" > "$COLUMNS_FILE" + [ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/helpers/00900_parquet_create_table_columns.py "$JSON" > "$COLUMNS_FILE" # Debug only: # [ -n "$PARQUET_READER" ] && $PARQUET_READER $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump diff --git a/tests/queries/0_stateless/00900_parquet_create_table_columns.py b/tests/queries/0_stateless/helpers/00900_parquet_create_table_columns.py similarity index 100% rename from tests/queries/0_stateless/00900_parquet_create_table_columns.py rename to tests/queries/0_stateless/helpers/00900_parquet_create_table_columns.py From 43799204f9a21bb291d75b8fe0c0dc4f8c5d98c1 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sun, 8 Nov 2020 05:16:41 +0300 Subject: [PATCH 18/21] Remove misleading comment --- contrib/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 96d8084d6c9..bec5f42c3e0 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -178,7 +178,6 @@ if (USE_INTERNAL_PARQUET_LIBRARY) # cd {BUILD_DIR}/contrib/arrow/cpp && mkdir -p build && cd build # cmake .. -DARROW_PARQUET=1 -DARROW_WITH_SNAPPY=1 -DPARQUET_BUILD_EXECUTABLES=1 # make -j8 - # cd contrib/arrow/cpp && mkdir -p build && cmake .. 
-DPARQUET_BUILD_EXECUTABLES=1 && make -j8 # {BUILD_DIR}/contrib/arrow/cpp/build/release/parquet-reader some_file.parquet add_subdirectory(arrow-cmake) From a8dbbc6ccb983c7a92f37d31d721f88f445ece6f Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sun, 8 Nov 2020 05:26:38 +0300 Subject: [PATCH 19/21] Update reference with release build --- tests/queries/0_stateless/00900_parquet_load.reference | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00900_parquet_load.reference b/tests/queries/0_stateless/00900_parquet_load.reference index 3311e03d7f8..cc83ad19d44 100644 --- a/tests/queries/0_stateless/00900_parquet_load.reference +++ b/tests/queries/0_stateless/00900_parquet_load.reference @@ -272,7 +272,8 @@ Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin === Try load data from repeated_no_annotation.parquet -../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) +Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin + === Try load data from single_nan.parquet \N === Try load data from userdata1.parquet From 4526cae4b9e1ceae756304d2c9c8d96ff2774cb6 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sun, 8 Nov 2020 05:27:33 +0300 Subject: [PATCH 20/21] Add more comments in test launcher --- tests/queries/0_stateless/00900_parquet_load.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00900_parquet_load.sh b/tests/queries/0_stateless/00900_parquet_load.sh index 290a581aead..58fbea9f337 100755 --- a/tests/queries/0_stateless/00900_parquet_load.sh +++ b/tests/queries/0_stateless/00900_parquet_load.sh @@ -27,10 +27,15 @@ DATA_DIR=$CUR_DIR/data_parquet # To update: # cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/ +# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue. # There is failure due parsing nested arrays or nested maps with NULLs: # ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) -# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) -# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue + +# Strange behaviour for repeated_no_annotation.parquet: +# debug: +# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) +# release: +# Code: 349. 
DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_ALL=C sort); do echo === Try load data from "$NAME" JSON=$DATA_DIR/$NAME.json From b3d94fae5f66133bd3b7e68071c4b8b9e5653325 Mon Sep 17 00:00:00 2001 From: FawnD2 Date: Sun, 8 Nov 2020 15:00:55 +0300 Subject: [PATCH 21/21] Disable input file with strange behaviour --- .../0_stateless/00900_parquet_load.reference | 3 --- tests/queries/0_stateless/00900_parquet_load.sh | 2 +- .../repeated_no_annotation.parquet.columns | 1 - ...quet => repeated_no_annotation.parquet.disabled} | Bin 4 files changed, 1 insertion(+), 5 deletions(-) delete mode 100644 tests/queries/0_stateless/data_parquet/repeated_no_annotation.parquet.columns rename tests/queries/0_stateless/data_parquet/{repeated_no_annotation.parquet => repeated_no_annotation.parquet.disabled} (100%) diff --git a/tests/queries/0_stateless/00900_parquet_load.reference b/tests/queries/0_stateless/00900_parquet_load.reference index cc83ad19d44..f93be897da8 100644 --- a/tests/queries/0_stateless/00900_parquet_load.reference +++ b/tests/queries/0_stateless/00900_parquet_load.reference @@ -271,9 +271,6 @@ Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported === Try load data from nulls.snappy.parquet Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin -=== Try load data from repeated_no_annotation.parquet -Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin - === Try load data from single_nan.parquet \N === Try load data from userdata1.parquet diff --git a/tests/queries/0_stateless/00900_parquet_load.sh b/tests/queries/0_stateless/00900_parquet_load.sh index 58fbea9f337..43b738aab83 100755 --- a/tests/queries/0_stateless/00900_parquet_load.sh +++ b/tests/queries/0_stateless/00900_parquet_load.sh @@ -31,7 +31,7 @@ DATA_DIR=$CUR_DIR/data_parquet # There is failure due parsing nested arrays or nested maps with NULLs: # ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id()) -# Strange behaviour for repeated_no_annotation.parquet: +# Strange behaviour for repeated_no_annotation.parquet around __builtin_expect, so this file was disabled: # debug: # ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type) # release: # Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin diff --git a/tests/queries/0_stateless/data_parquet/repeated_no_annotation.parquet.columns b/tests/queries/0_stateless/data_parquet/repeated_no_annotation.parquet.columns deleted file mode 100644 index 67c57236e9d..00000000000 --- a/tests/queries/0_stateless/data_parquet/repeated_no_annotation.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`id` Nullable(Int32), `phoneNumbers` Tuple(Nullable(Int64), Nullable(String)) diff --git a/tests/queries/0_stateless/data_parquet/repeated_no_annotation.parquet b/tests/queries/0_stateless/data_parquet/repeated_no_annotation.parquet.disabled similarity index 100% rename from tests/queries/0_stateless/data_parquet/repeated_no_annotation.parquet rename to tests/queries/0_stateless/data_parquet/repeated_no_annotation.parquet.disabled
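Context for the patches above: each *.parquet test input has a sibling *.columns file holding a ClickHouse column list (generated by the helpers/00900_parquet_create_table_columns.py helper from parquet-reader's --json output), and the launcher pipes the binary file into an INSERT ... FORMAT Parquet statement, which is why every failure recorded in the reference file ends with "data for INSERT was parsed from stdin". The bash sketch below is a hypothetical reconstruction of one iteration of that load cycle, not the test's literal code: the table name parquet_load, ENGINE = Memory, and the $CLICKHOUSE_CLIENT and $DATA_DIR variables are illustrative assumptions; only stdin-fed "INSERT ... FORMAT Parquet" is confirmed by the diffs themselves.

#!/usr/bin/env bash
# Minimal sketch of loading one test file (assumed names, see note above).
NAME=userdata1.parquet
COLUMNS=$(cat "$DATA_DIR/$NAME.columns")   # e.g. `id` Nullable(Int32), ...

$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS parquet_load"
$CLICKHOUSE_CLIENT --query="CREATE TABLE parquet_load ($COLUMNS) ENGINE = Memory"

# Stream the binary Parquet file through stdin; a schema or NULL mismatch here
# surfaces as the "data for INSERT was parsed from stdin" exceptions that the
# reference file records for the unsupported inputs.
cat "$DATA_DIR/$NAME" | $CLICKHOUSE_CLIENT --query="INSERT INTO parquet_load FORMAT Parquet"
$CLICKHOUSE_CLIENT --query="SELECT * FROM parquet_load"

Because the loop's stdout is compared line by line against 00900_parquet_load.reference, the seemingly cosmetic fixes above are load-bearing: the trailing newlines added to the *.columns files keep the generated schemas stable, and LC_ALL=C sort pins the file iteration order regardless of the host locale.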