Merge pull request #16664 from FawnD2/switch-upstream-for-arrow-submodule

Switch upstream repo for Arrow submodule
This commit is contained in:
alexey-milovidov 2020-12-22 10:55:23 +03:00 committed by GitHub
commit a4b0d9ba4c
53 changed files with 448 additions and 336 deletions

.gitmodules vendored (3 changed lines)
View File

@ -53,7 +53,8 @@
url = https://github.com/ClickHouse-Extras/Turbo-Base64.git
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/apache/arrow
url = https://github.com/ClickHouse-Extras/arrow
branch = clickhouse-arrow-2.0.0
[submodule "contrib/thrift"]
path = contrib/thrift
url = https://github.com/apache/thrift.git
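
For an existing checkout that still points at the old upstream, the same switch can be reproduced locally with plain git. A minimal sketch, using only the URL and branch from the .gitmodules change above:

# Point the submodule at the fork and branch introduced by this commit, then re-sync the working copy.
git config -f .gitmodules submodule.contrib/arrow.url https://github.com/ClickHouse-Extras/arrow
git config -f .gitmodules submodule.contrib/arrow.branch clickhouse-arrow-2.0.0
git submodule sync contrib/arrow
git submodule update --init --recursive contrib/arrow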

View File

@ -141,11 +141,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O
else()
set(USE_INTERNAL_PARQUET_LIBRARY 1)
if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
set(ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src")
set(PARQUET_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src" ${ClickHouse_BINARY_DIR}/contrib/arrow/cpp/src)
endif()
if(MAKE_STATIC_LIBRARIES)
set(FLATBUFFERS_LIBRARY flatbuffers)
set(ARROW_LIBRARY arrow_static)
@ -155,9 +150,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O
set(FLATBUFFERS_LIBRARY flatbuffers_shared)
set(ARROW_LIBRARY arrow_shared)
set(PARQUET_LIBRARY parquet_shared)
if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
list(APPEND PARQUET_LIBRARY boost::regex)
endif()
set(THRIFT_LIBRARY thrift)
endif()

View File

@ -163,51 +163,21 @@ if(USE_INTERNAL_SNAPPY_LIBRARY)
endif()
if (USE_INTERNAL_PARQUET_LIBRARY)
if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
# We don't use Arrow's CMake files because they pull in too many dependencies and download some libraries at compile time
# But this mode can be used for updating auto-generated parquet files:
# cmake -DUSE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE=1 -DUSE_STATIC_LIBRARIES=0
# copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> /contrib/arrow-cmake/cpp/src/parquet/
# But you can update auto-generated parquet files manually:
# cd {BUILD_DIR}/contrib/arrow/cpp/src/parquet && mkdir -p build && cd build
# cmake .. -DARROW_COMPUTE=ON -DARROW_PARQUET=ON -DARROW_SIMD_LEVEL=NONE -DARROW_VERBOSE_THIRDPARTY_BUILD=ON
# -DARROW_BUILD_SHARED=1 -DARROW_BUILD_UTILITIES=OFF -DARROW_BUILD_INTEGRATION=OFF
# -DBoost_FOUND=1 -DARROW_TEST_LINKAGE="shared"
# make -j8
# copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> {BUILD_DIR}/contrib/arrow-cmake/cpp/src/parquet/
# Also useful parquet reader:
# cd contrib/arrow/cpp/build && mkdir -p build && cmake .. -DPARQUET_BUILD_EXECUTABLES=1 && make -j8
# contrib/arrow/cpp/build/debug/parquet-reader some_file.parquet
# cd {BUILD_DIR}/contrib/arrow/cpp && mkdir -p build && cd build
# cmake .. -DARROW_PARQUET=1 -DARROW_WITH_SNAPPY=1 -DPARQUET_BUILD_EXECUTABLES=1
# make -j8
# {BUILD_DIR}/contrib/arrow/cpp/build/release/parquet-reader some_file.parquet
set (ARROW_COMPUTE ON CACHE INTERNAL "")
set (ARROW_PARQUET ON CACHE INTERNAL "")
set (ARROW_VERBOSE_THIRDPARTY_BUILD ON CACHE INTERNAL "")
set (ARROW_BUILD_SHARED 1 CACHE INTERNAL "")
set (ARROW_BUILD_UTILITIES OFF CACHE INTERNAL "")
set (ARROW_BUILD_INTEGRATION OFF CACHE INTERNAL "")
set (ARROW_BOOST_HEADER_ONLY ON CACHE INTERNAL "")
set (Boost_FOUND 1 CACHE INTERNAL "")
if (MAKE_STATIC_LIBRARIES)
set (PARQUET_ARROW_LINKAGE "static" CACHE INTERNAL "")
set (ARROW_TEST_LINKAGE "static" CACHE INTERNAL "")
set (ARROW_BUILD_STATIC ${MAKE_STATIC_LIBRARIES} CACHE INTERNAL "")
else ()
set (PARQUET_ARROW_LINKAGE "shared" CACHE INTERNAL "")
set (ARROW_TEST_LINKAGE "shared" CACHE INTERNAL "")
endif ()
if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO")
set (_save_build_type ${CMAKE_BUILD_TYPE})
set (CMAKE_BUILD_TYPE Release)
string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
endif ()
# Because Arrow uses CMAKE_SOURCE_DIR as a project path
# Hopefully will be fixed in https://github.com/apache/arrow/pull/2676
set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/cmake_modules")
add_subdirectory (arrow/cpp)
if (_save_build_type)
set (CMAKE_BUILD_TYPE ${_save_build_type})
unset (_save_build_type)
string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
endif ()
else()
add_subdirectory(arrow-cmake)
# The library is large - avoid bloat.
@ -215,7 +185,6 @@ else()
target_compile_options (${THRIFT_LIBRARY} PRIVATE -g0)
target_compile_options (${PARQUET_LIBRARY} PRIVATE -g0)
endif()
endif()
if (USE_INTERNAL_AVRO_LIBRARY)
add_subdirectory(avro-cmake)

contrib/arrow vendored (2 changed lines)

@ -1 +1 @@
Subproject commit 3cbcb7b62c2f2d02851bff837758637eb592a64b
Subproject commit 744bdfe188f018e5e05f5deebd4e9ee0a7706cf4

View File

@ -144,15 +144,16 @@ set(ORC_SRCS
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow)
configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/arrow/util/config.h")
configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/cpp/src/arrow/util/config.h")
# arrow/cpp/src/arrow/CMakeLists.txt
set(ARROW_SRCS
${LIBRARY_DIR}/array.cc
${LIBRARY_DIR}/buffer.cc
${LIBRARY_DIR}/device.cc
${LIBRARY_DIR}/builder.cc
${LIBRARY_DIR}/chunked_array.cc
${LIBRARY_DIR}/compare.cc
${LIBRARY_DIR}/datum.cc
${LIBRARY_DIR}/device.cc
${LIBRARY_DIR}/extension_type.cc
${LIBRARY_DIR}/memory_pool.cc
${LIBRARY_DIR}/pretty_print.cc
@ -167,11 +168,12 @@ set(ARROW_SRCS
${LIBRARY_DIR}/type.cc
${LIBRARY_DIR}/visitor.cc
${LIBRARY_DIR}/tensor/coo_converter.cc
${LIBRARY_DIR}/tensor/csc_converter.cc
${LIBRARY_DIR}/tensor/csf_converter.cc
${LIBRARY_DIR}/tensor/csr_converter.cc
${LIBRARY_DIR}/array/array_base.cc
${LIBRARY_DIR}/array/array_binary.cc
${LIBRARY_DIR}/array/array_decimal.cc
${LIBRARY_DIR}/array/array_dict.cc
${LIBRARY_DIR}/array/array_nested.cc
${LIBRARY_DIR}/array/array_primitive.cc
${LIBRARY_DIR}/array/builder_adaptive.cc
${LIBRARY_DIR}/array/builder_base.cc
${LIBRARY_DIR}/array/builder_binary.cc
@ -181,17 +183,50 @@ set(ARROW_SRCS
${LIBRARY_DIR}/array/builder_primitive.cc
${LIBRARY_DIR}/array/builder_union.cc
${LIBRARY_DIR}/array/concatenate.cc
${LIBRARY_DIR}/array/dict_internal.cc
${LIBRARY_DIR}/array/data.cc
${LIBRARY_DIR}/array/diff.cc
${LIBRARY_DIR}/array/util.cc
${LIBRARY_DIR}/array/validate.cc
${LIBRARY_DIR}/csv/converter.cc
${LIBRARY_DIR}/compute/api_scalar.cc
${LIBRARY_DIR}/compute/api_vector.cc
${LIBRARY_DIR}/compute/cast.cc
${LIBRARY_DIR}/compute/exec.cc
${LIBRARY_DIR}/compute/function.cc
${LIBRARY_DIR}/compute/kernel.cc
${LIBRARY_DIR}/compute/registry.cc
${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc
${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc
${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc
${LIBRARY_DIR}/compute/kernels/codegen_internal.cc
${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc
${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc
${LIBRARY_DIR}/compute/kernels/scalar_compare.cc
${LIBRARY_DIR}/compute/kernels/scalar_fill_null.cc
${LIBRARY_DIR}/compute/kernels/scalar_nested.cc
${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc
${LIBRARY_DIR}/compute/kernels/scalar_string.cc
${LIBRARY_DIR}/compute/kernels/scalar_validity.cc
${LIBRARY_DIR}/compute/kernels/vector_hash.cc
${LIBRARY_DIR}/compute/kernels/vector_nested.cc
${LIBRARY_DIR}/compute/kernels/vector_selection.cc
${LIBRARY_DIR}/compute/kernels/vector_sort.cc
${LIBRARY_DIR}/compute/kernels/util_internal.cc
${LIBRARY_DIR}/csv/chunker.cc
${LIBRARY_DIR}/csv/column_builder.cc
${LIBRARY_DIR}/csv/column_decoder.cc
${LIBRARY_DIR}/csv/converter.cc
${LIBRARY_DIR}/csv/options.cc
${LIBRARY_DIR}/csv/parser.cc
${LIBRARY_DIR}/csv/reader.cc
${LIBRARY_DIR}/csv/column_decoder.cc
${LIBRARY_DIR}/ipc/dictionary.cc
${LIBRARY_DIR}/ipc/feather.cc
@ -202,14 +237,25 @@ set(ARROW_SRCS
${LIBRARY_DIR}/ipc/writer.cc
${LIBRARY_DIR}/io/buffered.cc
${LIBRARY_DIR}/io/caching.cc
${LIBRARY_DIR}/io/compressed.cc
${LIBRARY_DIR}/io/file.cc
${LIBRARY_DIR}/io/interfaces.cc
${LIBRARY_DIR}/io/memory.cc
${LIBRARY_DIR}/io/slow.cc
${LIBRARY_DIR}/tensor/coo_converter.cc
${LIBRARY_DIR}/tensor/csf_converter.cc
${LIBRARY_DIR}/tensor/csx_converter.cc
${LIBRARY_DIR}/util/basic_decimal.cc
${LIBRARY_DIR}/util/bit_block_counter.cc
${LIBRARY_DIR}/util/bit_run_reader.cc
${LIBRARY_DIR}/util/bit_util.cc
${LIBRARY_DIR}/util/bitmap.cc
${LIBRARY_DIR}/util/bitmap_builders.cc
${LIBRARY_DIR}/util/bitmap_ops.cc
${LIBRARY_DIR}/util/bpacking.cc
${LIBRARY_DIR}/util/compression.cc
${LIBRARY_DIR}/util/compression_lz4.cc
${LIBRARY_DIR}/util/compression_snappy.cc
@ -217,8 +263,12 @@ set(ARROW_SRCS
${LIBRARY_DIR}/util/compression_zstd.cc
${LIBRARY_DIR}/util/cpu_info.cc
${LIBRARY_DIR}/util/decimal.cc
${LIBRARY_DIR}/util/delimiting.cc
${LIBRARY_DIR}/util/formatting.cc
${LIBRARY_DIR}/util/future.cc
${LIBRARY_DIR}/util/int_util.cc
${LIBRARY_DIR}/util/io_util.cc
${LIBRARY_DIR}/util/iterator.cc
${LIBRARY_DIR}/util/key_value_metadata.cc
${LIBRARY_DIR}/util/logging.cc
${LIBRARY_DIR}/util/memory.cc
@ -226,27 +276,15 @@ set(ARROW_SRCS
${LIBRARY_DIR}/util/string.cc
${LIBRARY_DIR}/util/task_group.cc
${LIBRARY_DIR}/util/thread_pool.cc
${LIBRARY_DIR}/util/time.cc
${LIBRARY_DIR}/util/trie.cc
${LIBRARY_DIR}/util/utf8.cc
${LIBRARY_DIR}/util/future.cc
${LIBRARY_DIR}/util/formatting.cc
${LIBRARY_DIR}/util/parsing.cc
${LIBRARY_DIR}/util/time.cc
${LIBRARY_DIR}/util/delimiting.cc
${LIBRARY_DIR}/util/iterator.cc
${LIBRARY_DIR}/util/value_parsing.cc
${LIBRARY_DIR}/vendored/base64.cpp
${ORC_SRCS}
)
set(ARROW_SRCS ${ARROW_SRCS}
${LIBRARY_DIR}/compute/context.cc
${LIBRARY_DIR}/compute/kernels/boolean.cc
${LIBRARY_DIR}/compute/kernels/cast.cc
${LIBRARY_DIR}/compute/kernels/hash.cc
${LIBRARY_DIR}/compute/kernels/util_internal.cc
)
if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY)
set(ARROW_WITH_SNAPPY 1)
endif ()
@ -289,7 +327,8 @@ if (USE_INTERNAL_PROTOBUF_LIBRARY)
add_dependencies(${ARROW_LIBRARY} protoc)
endif ()
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src)
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src)
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/cpp/src)
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${Protobuf_LIBRARY})
target_link_libraries(${ARROW_LIBRARY} PRIVATE lz4)
if (ARROW_WITH_SNAPPY)
@ -319,19 +358,26 @@ set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/parquet)
set(GEN_LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/generated)
# arrow/cpp/src/parquet/CMakeLists.txt
set(PARQUET_SRCS
${LIBRARY_DIR}/arrow/path_internal.cc
${LIBRARY_DIR}/arrow/reader.cc
${LIBRARY_DIR}/arrow/reader_internal.cc
${LIBRARY_DIR}/arrow/schema.cc
${LIBRARY_DIR}/arrow/schema_internal.cc
${LIBRARY_DIR}/arrow/writer.cc
${LIBRARY_DIR}/arrow/path_internal.cc
${LIBRARY_DIR}/bloom_filter.cc
${LIBRARY_DIR}/column_reader.cc
${LIBRARY_DIR}/column_scanner.cc
${LIBRARY_DIR}/column_writer.cc
${LIBRARY_DIR}/deprecated_io.cc
${LIBRARY_DIR}/encoding.cc
${LIBRARY_DIR}/encryption.cc
${LIBRARY_DIR}/encryption_internal.cc
${LIBRARY_DIR}/file_reader.cc
${LIBRARY_DIR}/file_writer.cc
${LIBRARY_DIR}/internal_file_decryptor.cc
${LIBRARY_DIR}/internal_file_encryptor.cc
${LIBRARY_DIR}/level_conversion.cc
${LIBRARY_DIR}/level_comparison.cc
${LIBRARY_DIR}/metadata.cc
${LIBRARY_DIR}/murmur3.cc
${LIBRARY_DIR}/platform.cc
@ -340,10 +386,6 @@ set(PARQUET_SRCS
${LIBRARY_DIR}/schema.cc
${LIBRARY_DIR}/statistics.cc
${LIBRARY_DIR}/types.cc
${LIBRARY_DIR}/encryption.cc
${LIBRARY_DIR}/encryption_internal.cc
${LIBRARY_DIR}/internal_file_decryptor.cc
${LIBRARY_DIR}/internal_file_encryptor.cc
${GEN_LIBRARY_DIR}/parquet_constants.cpp
${GEN_LIBRARY_DIR}/parquet_types.cpp

View File

@ -1,26 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#define ARROW_VERSION_MAJOR
#define ARROW_VERSION_MINOR
#define ARROW_VERSION_PATCH
#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH
#define ARROW_SO_VERSION ""
#define ARROW_FULL_SO_VERSION ""
/* #undef GRPCPP_PP_INCLUDE */

View File

@ -22,8 +22,8 @@
#define PARQUET_VERSION_MINOR 5
#define PARQUET_VERSION_PATCH 1
#define PARQUET_SO_VERSION 0
#define PARQUET_FULL_SO_VERSION 0.17
#define PARQUET_SO_VERSION "200"
#define PARQUET_FULL_SO_VERSION "200.0.0"
// define the parquet created by version
#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT"

View File

@ -363,7 +363,7 @@ endif ()
if (USE_PARQUET)
dbms_target_link_libraries(PRIVATE ${PARQUET_LIBRARY})
if (NOT USE_INTERNAL_PARQUET_LIBRARY OR USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
if (NOT USE_INTERNAL_PARQUET_LIBRARY)
dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${PARQUET_INCLUDE_DIR} ${ARROW_INCLUDE_DIR})
if (USE_STATIC_LIBRARIES)
dbms_target_link_libraries(PRIVATE ${ARROW_LIBRARY})

View File

@ -62,9 +62,9 @@ void ArrowBlockOutputFormat::prepareWriter(const std::shared_ptr<arrow::Schema>
// TODO: should we use arrow::ipc::IpcOptions::alignment?
if (stream)
writer_status = arrow::ipc::NewStreamWriter(arrow_ostream.get(), schema);
writer_status = arrow::ipc::MakeStreamWriter(arrow_ostream.get(), schema);
else
writer_status = arrow::ipc::NewFileWriter(arrow_ostream.get(), schema);
writer_status = arrow::ipc::MakeFileWriter(arrow_ostream.get(), schema);
if (!writer_status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,

View File

@ -1,54 +0,0 @@
#!/usr/bin/env perl
package parquet_create_table_columns;

use strict;
no warnings 'experimental';
use feature 'signatures';
use JSON::XS;

#use Data::Dumper;

sub file_read($file) {
    open my $f, '<', $file or return;
    local $/ = undef;
    my $ret = <$f>;
    close $f;
    return $ret;
}

our $type_parquet_logical_to_clickhouse = {
    DECIMAL => 'Decimal128(1)',
    TIMESTAMP_MICROS => 'DateTime',
    TIMESTAMP_MILLIS => 'DateTime',
};

our $type_parquet_physical_to_clickhouse = {
    BOOLEAN => 'UInt8',
    INT32 => 'Int32',
    INT64 => 'Int64',
    FLOAT => 'Float32',
    DOUBLE => 'Float64',
    BYTE_ARRAY => 'String',
    FIXED_LEN_BYTE_ARRAY => 'String', # Maybe FixedString?
    INT96 => 'Int64', # TODO!
};

sub columns ($json) {
    my @list;
    my %uniq;
    for my $column (@{$json->{Columns}}) {
        #warn Data::Dumper::Dumper $column;
        my $name = $column->{'Name'};
        my $type = $type_parquet_logical_to_clickhouse->{$column->{'LogicalType'}} || $type_parquet_physical_to_clickhouse->{$column->{'PhysicalType'}};
        unless ($type) {
            warn "Unknown type [$column->{'PhysicalType'}:$column->{'LogicalType'}] of column [$name]";
        }
        $type = "Nullable($type)";
        $name .= $column->{'Id'} if $uniq{$name}++; # Names can be non-unique
        push @list, {name => $name, type => $type};
    }
    print join ', ', map {"`$_->{name}` $_->{type}"} @list;
}

sub columns_file ($file) {
    return columns(JSON::XS::decode_json(file_read($file)));
}

columns_file(shift) unless caller;

View File

@ -13,134 +13,220 @@
=== Try load data from alltypes_plain.snappy.parquet
6 1 0 0 0 0 0 0 04/01/09 0 1238544000
7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060
=== Try load data from binary.parquet
\0







\b
\t
\n
=== Try load data from byte_array_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from datapage_v2.snappy.parquet
Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet implemented: Unsupported encoding.: data for INSERT was parsed from stdin
=== Try load data from dict-page-offset-zero.parquet
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
=== Try load data from fixed_length_decimal.parquet
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from fixed_length_decimal_1.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from fixed_length_decimal_legacy.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from hadoop_lz4_compressed.parquet
1593604800 abc 42
1593604800 def 7.7
1593604801 abc 42.125
1593604801 def 7.7
=== Try load data from int32_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from int64_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from list_columns.parquet
Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
=== Try load data from nation.dict-malformed.parquet
0 ALGERIA 0 haggle. carefully final deposits detect slyly agai
1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon
@ -168,23 +254,25 @@ Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet impl
23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull
24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be
=== Try load data from nested_lists.snappy.parquet
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
Code: 70. DB::Ex---tion: The type "list" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
=== Try load data from nested_maps.snappy.parquet
Code: 33. DB::Ex---tion: Error while reading Parquet data: NotImplemented: Reading lists of structs from Parquet files not yet supported: key_value: list<key_value: struct<key: string not null, value: struct<key_value: list<key_value: struct<key: int32 not null, value: bool not null> not null> not null>> not null> not null: data for INSERT was parsed from stdin
Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
=== Try load data from non_hadoop_lz4_compressed.parquet
1593604800 abc 42
1593604800 def 7.7
1593604801 abc 42.125
1593604801 def 7.7
=== Try load data from nonnullable.impala.parquet
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
=== Try load data from nullable.impala.parquet
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
=== Try load data from nulls.snappy.parquet
Code: 8. DB::Ex---tion: Column "b_c_int" is not presented in input data: data for INSERT was parsed from stdin
=== Try load data from repeated_no_annotation.parquet
Code: 8. DB::Ex---tion: Column "number" is not presented in input data: data for INSERT was parsed from stdin
Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
=== Try load data from single_nan.parquet
\N
=== Try load data from userdata1.parquet
1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02
1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV

View File

@ -5,8 +5,6 @@
# TODO: Add more files.
#
# To regenerate data install perl JSON::XS module: sudo apt install libjson-xs-perl
# Also 5 sample files from
# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet
# ...
@ -19,38 +17,46 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CUR_DIR"/../shell_config.sh
CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY")
[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../../..
[ "$CB_DIR" != "." ] && BUILD_DIR=$CB_DIR/../..
[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../../..
[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../..
[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../..
DATA_DIR=$CUR_DIR/data_parquet
[ -n "$ROOT_DIR" ] && [ -z "$PARQUET_READER" ] && PARQUET_READER="$ROOT_DIR"/contrib/arrow/cpp/build/release/parquet-reader
# To update:
# cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/
# BUG! nulls.snappy.parquet - parquet-reader shows wrong structure. Actual structure is {"type":"struct","fields":[{"name":"b_struct","type":{"type":"struct","fields":[{"name":"b_c_int","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}
# why? repeated_no_annotation.parquet
# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue.
# There is a failure when parsing nested arrays or nested maps with NULLs:
# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | sort); do
# Strange behaviour for repeated_no_annotation.parquet around __builtin_expect, so this file was disabled:
# debug:
# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type)
# release:
# Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin
for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_ALL=C sort); do
echo === Try load data from "$NAME"
JSON=$DATA_DIR/$NAME.json
COLUMNS_FILE=$DATA_DIR/$NAME.columns
# If you want to change or add a .parquet file - rm data_parquet/*.json data_parquet/*.columns
[ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$BUILD_DIR"/contrib/arrow-cmake/parquet-reader --json "$DATA_DIR"/"$NAME" > "$JSON"
[ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/00900_parquet_create_table_columns.pl "$JSON" > "$COLUMNS_FILE"
[ -n "$PARQUET_READER" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$PARQUET_READER" --json "$DATA_DIR"/"$NAME" > "$JSON"
[ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/helpers/00900_parquet_create_table_columns.py "$JSON" > "$COLUMNS_FILE"
# Debug only:
# [ -n "$BUILD_DIR" ] && $BUILD_DIR/contrib/arrow-cmake/parquet-reader $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump
# [ -n "$PARQUET_READER" ] && $PARQUET_READER $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump
#COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.pl $JSON` 2>&1 || continue
# COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.py $JSON` 2>&1 || continue
COLUMNS=$(cat "$COLUMNS_FILE") || continue
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load ($COLUMNS) ENGINE = Memory"
# Some files are broken, exception is ok.
# Some files contain unsupported data structures, exception is ok.
cat "$DATA_DIR"/"$NAME" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_load FORMAT Parquet" 2>&1 | sed 's/Exception/Ex---tion/'
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load LIMIT 100"
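
The test script now looks for an externally built parquet-reader (see the PARQUET_READER assignment above) instead of a binary from the ClickHouse build directory. A sketch of regenerating the schema metadata for a single test file, assuming the build commands from the contrib/CMakeLists.txt comments and the variables defined in this script:

# One-time: build the bundled Arrow together with the parquet-reader utility.
cd "$ROOT_DIR"/contrib/arrow/cpp && mkdir -p build && cd build
cmake .. -DARROW_PARQUET=1 -DARROW_WITH_SNAPPY=1 -DPARQUET_BUILD_EXECUTABLES=1 && make -j8
# Dump the schema as JSON and rebuild the CREATE TABLE column list for one file.
"$PARQUET_READER" --json "$DATA_DIR"/int32_decimal.parquet > "$DATA_DIR"/int32_decimal.parquet.json
"$CUR_DIR"/helpers/00900_parquet_create_table_columns.py "$DATA_DIR"/int32_decimal.parquet.json > "$DATA_DIR"/int32_decimal.parquet.columns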

Binary file not shown.

View File

@ -0,0 +1 @@
`foo` Nullable(String)

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(4, 2))

View File

@ -1 +1 @@
`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `element` Nullable(Int32)
`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Nullable(Int32)

View File

@ -0,0 +1 @@
`l_partkey` Nullable(Int32)

View File

@ -0,0 +1 @@
`value` Nullable(Decimal(25, 2))

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(25, 2))

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(13, 2))

View File

@ -0,0 +1 @@
`c0` Nullable(Int64), `c1` Nullable(String), `v11` Nullable(Float64)

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(4, 2))

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(10, 2))

View File

@ -0,0 +1 @@
`int64_list` Nullable(Int64), `utf8_list` Nullable(String)

View File

@ -1 +1 @@
`element` Nullable(String), `b` Nullable(Int32)
`a` Nullable(String), `b` Nullable(Int32)

View File

@ -1 +1 @@
`key` Nullable(String), `key1` Nullable(Int32), `value` Nullable(UInt8), `b` Nullable(Int32), `c` Nullable(Float64)
`a` Tuple(Nullable(String), Nullable(Int32), Nullable(UInt8)), `b` Nullable(Int32), `c` Nullable(Float64)

View File

@ -0,0 +1 @@
`c0` Nullable(Int64), `c1` Nullable(String), `v11` Nullable(Float64)

View File

@ -1 +1 @@
`ID` Nullable(Int64), `element` Nullable(Int32), `element2` Nullable(Int32), `key` Nullable(String), `value` Nullable(Int32), `key5` Nullable(String), `value6` Nullable(Int32), `a` Nullable(Int32), `element8` Nullable(Int32), `e` Nullable(Int32), `f` Nullable(String), `key11` Nullable(String), `element12` Nullable(Float64)
`ID` Nullable(Int64), `Int_Array` Nullable(Int32), `int_array_array` Nullable(Int32), `Int_Map` Tuple(Nullable(String), Nullable(Int32)), `int_map_array` Tuple(Nullable(String), Nullable(Int32)), `nested_Struct` Tuple(Nullable(Int32), Nullable(Int32), Nullable(Int32), Nullable(String), Nullable(String), Nullable(Float64))

View File

@ -1 +1 @@
`id` Nullable(Int64), `element` Nullable(Int32), `element2` Nullable(Int32), `key` Nullable(String), `value` Nullable(Int32), `key5` Nullable(String), `value6` Nullable(Int32), `A` Nullable(Int32), `element8` Nullable(Int32), `E` Nullable(Int32), `F` Nullable(String), `key11` Nullable(String), `element12` Nullable(Float64)
`id` Nullable(Int64), `int_array` Nullable(Int32), `int_array_Array` Nullable(Int32), `int_map` Tuple(Nullable(String), Nullable(Int32)), `int_Map_Array` Tuple(Nullable(String), Nullable(Int32)), `nested_struct` Tuple(Nullable(Int32), Nullable(Int32), Nullable(Int32), Nullable(String), Nullable(String), Nullable(Float64))

View File

@ -1 +1 @@
`b_c_int` Nullable(Int32)
`b_struct` Nullable(Int32)

View File

@ -1 +0,0 @@
`id` Nullable(Int32), `number` Nullable(Int64), `kind` Nullable(String)

View File

@ -0,0 +1 @@
`mycol` Nullable(Float64)

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python3

import json
import sys

TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE = {
    "TIMESTAMP_MICROS": "DateTime",
    "TIMESTAMP_MILLIS": "DateTime",
    "UTF8": "String",
}

TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE = {
    "BOOLEAN": "UInt8",
    "INT32": "Int32",
    "INT64": "Int64",
    "FLOAT": "Float32",
    "DOUBLE": "Float64",
    "BYTE_ARRAY": "String",
    "INT96": "Int64", # TODO!
}


def read_file(filename):
    with open(filename, "rb") as f:
        return f.read().decode("raw_unicode_escape")


def get_column_name(column):
    return column["Name"].split(".", 1)[0]


def resolve_clickhouse_column_type(column):
    column_name = get_column_name(column)
    logical_type = column.get("LogicalType", {})
    converted_type = column.get("ConvertedType", "").upper()
    physical_type = column.get("PhysicalType", "").upper()
    if logical_type and logical_type.get("Type", "").upper() == "DECIMAL":
        precision = int(logical_type["precision"])
        scale = int(logical_type["scale"])
        if precision < 1 or precision > 76:
            raise RuntimeError("Column {} has invalid Decimal precision {}".format(column_name, precision))
        if precision > 38:
            raise RuntimeError("Column {} has unsupported Decimal precision {}".format(column_name, precision))
        if scale < 0 or scale > precision:
            raise RuntimeError("Column {} has invalid Decimal scale {} for precision {}".format(column_name, scale, precision))
        return "Decimal({}, {})".format(precision, scale)
    if converted_type and converted_type != "NONE":
        result_type = TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE.get(converted_type)
        if result_type:
            return result_type
        raise RuntimeError("Column {} has unknown ConvertedType: {}".format(column_name, converted_type))
    if physical_type and physical_type != "NONE":
        result_type = TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE.get(physical_type)
        if result_type:
            return result_type
        raise RuntimeError("Column {} has unknown PhysicalType: {}".format(column_name, physical_type))
    raise RuntimeError("Column {} has invalid types: ConvertedType={}, PhysicalType={}".format(column_name, converted_type, physical_type))


def dump_columns(obj):
    descr_by_column_name = {}
    columns_descr = []
    for column in obj["Columns"]:
        column_name = get_column_name(column)
        column_type = resolve_clickhouse_column_type(column)
        result_type = "Nullable({})".format(column_type)
        if column_name in descr_by_column_name:
            descr = descr_by_column_name[column_name]
            descr["types"].append(result_type)
        else:
            descr = {
                "name": column_name,
                "types": [result_type],
            }
            descr_by_column_name[column_name] = descr
            columns_descr.append(descr)

    # Make tuples from nested types. CH Server doesn't support such Arrow type but it makes Server Exceptions more relevant.
    def _format_type(types):
        if len(types) == 1:
            return types[0]
        else:
            return "Tuple({})".format(", ".join(types))

    print(", ".join(map(lambda descr: "`{}` {}".format(descr["name"], _format_type(descr["types"])), columns_descr)))


def dump_columns_from_file(filename):
    dump_columns(json.loads(read_file(filename), strict=False))


if __name__ == "__main__":
    filename = sys.argv[1]
    dump_columns_from_file(filename)
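
Nested Parquet columns share a top-level name, so their types are grouped into a Tuple, as the comment in dump_columns notes. A self-contained sketch of that behaviour; the JSON below is synthetic and merely mimics the shape of parquet-reader --json output, and the script path is relative to tests/queries/0_stateless:

# Synthetic input: two leaf columns under "a" plus a plain "b" column.
cat > /tmp/example.json <<'EOF'
{"Columns": [
  {"Name": "a.key_value.key",   "PhysicalType": "BYTE_ARRAY", "ConvertedType": "UTF8"},
  {"Name": "a.key_value.value", "PhysicalType": "INT32",      "ConvertedType": "NONE"},
  {"Name": "b",                 "PhysicalType": "INT32",      "ConvertedType": "NONE"}
]}
EOF
python3 helpers/00900_parquet_create_table_columns.py /tmp/example.json
# Expected output:
# `a` Tuple(Nullable(String), Nullable(Int32)), `b` Nullable(Int32)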