Merge pull request #16664 from FawnD2/switch-upstream-for-arrow-submodule

Switch upstream repo for Arrow submodule
This commit is contained in:
alexey-milovidov 2020-12-22 10:55:23 +03:00 committed by GitHub
commit a4b0d9ba4c
53 changed files with 448 additions and 336 deletions

.gitmodules vendored (3 changed lines)
View File

@ -53,7 +53,8 @@
url = https://github.com/ClickHouse-Extras/Turbo-Base64.git
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/apache/arrow
url = https://github.com/ClickHouse-Extras/arrow
branch = clickhouse-arrow-2.0.0
[submodule "contrib/thrift"]
path = contrib/thrift
url = https://github.com/apache/thrift.git
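
For an existing checkout that still points at the old upstream, the same switch can be reproduced locally with plain git. A minimal sketch, using only the URL and branch from the .gitmodules change above:

# Point the submodule at the fork and branch introduced by this commit, then re-sync the working copy.
git config -f .gitmodules submodule.contrib/arrow.url https://github.com/ClickHouse-Extras/arrow
git config -f .gitmodules submodule.contrib/arrow.branch clickhouse-arrow-2.0.0
git submodule sync contrib/arrow
git submodule update --init --recursive contrib/arrow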

View File

@ -141,11 +141,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O
else()
set(USE_INTERNAL_PARQUET_LIBRARY 1)
if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
set(ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src")
set(PARQUET_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src" ${ClickHouse_BINARY_DIR}/contrib/arrow/cpp/src)
endif()
if(MAKE_STATIC_LIBRARIES)
set(FLATBUFFERS_LIBRARY flatbuffers)
set(ARROW_LIBRARY arrow_static)
@ -155,9 +150,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O
set(FLATBUFFERS_LIBRARY flatbuffers_shared)
set(ARROW_LIBRARY arrow_shared)
set(PARQUET_LIBRARY parquet_shared)
if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
list(APPEND PARQUET_LIBRARY boost::regex)
endif()
set(THRIFT_LIBRARY thrift)
endif()

View File

@ -163,51 +163,21 @@ if(USE_INTERNAL_SNAPPY_LIBRARY)
endif()
if (USE_INTERNAL_PARQUET_LIBRARY)
if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
# We don't use Arrow's CMake files because they pull in too many dependencies and download some libraries at compile time
# But this mode can be used for updating auto-generated parquet files:
# cmake -DUSE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE=1 -DUSE_STATIC_LIBRARIES=0
# copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> /contrib/arrow-cmake/cpp/src/parquet/
# But you can update auto-generated parquet files manually:
# cd {BUILD_DIR}/contrib/arrow/cpp/src/parquet && mkdir -p build && cd build
# cmake .. -DARROW_COMPUTE=ON -DARROW_PARQUET=ON -DARROW_SIMD_LEVEL=NONE -DARROW_VERBOSE_THIRDPARTY_BUILD=ON
# -DARROW_BUILD_SHARED=1 -DARROW_BUILD_UTILITIES=OFF -DARROW_BUILD_INTEGRATION=OFF
# -DBoost_FOUND=1 -DARROW_TEST_LINKAGE="shared"
# make -j8
# copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> {BUILD_DIR}/contrib/arrow-cmake/cpp/src/parquet/
# Also useful parquet reader:
# cd contrib/arrow/cpp/build && mkdir -p build && cmake .. -DPARQUET_BUILD_EXECUTABLES=1 && make -j8
# contrib/arrow/cpp/build/debug/parquet-reader some_file.parquet
# cd {BUILD_DIR}/contrib/arrow/cpp && mkdir -p build && cd build
# cmake .. -DARROW_PARQUET=1 -DARROW_WITH_SNAPPY=1 -DPARQUET_BUILD_EXECUTABLES=1
# make -j8
# {BUILD_DIR}/contrib/arrow/cpp/build/release/parquet-reader some_file.parquet
set (ARROW_COMPUTE ON CACHE INTERNAL "")
set (ARROW_PARQUET ON CACHE INTERNAL "")
set (ARROW_VERBOSE_THIRDPARTY_BUILD ON CACHE INTERNAL "")
set (ARROW_BUILD_SHARED 1 CACHE INTERNAL "")
set (ARROW_BUILD_UTILITIES OFF CACHE INTERNAL "")
set (ARROW_BUILD_INTEGRATION OFF CACHE INTERNAL "")
set (ARROW_BOOST_HEADER_ONLY ON CACHE INTERNAL "")
set (Boost_FOUND 1 CACHE INTERNAL "")
if (MAKE_STATIC_LIBRARIES)
set (PARQUET_ARROW_LINKAGE "static" CACHE INTERNAL "")
set (ARROW_TEST_LINKAGE "static" CACHE INTERNAL "")
set (ARROW_BUILD_STATIC ${MAKE_STATIC_LIBRARIES} CACHE INTERNAL "")
else ()
set (PARQUET_ARROW_LINKAGE "shared" CACHE INTERNAL "")
set (ARROW_TEST_LINKAGE "shared" CACHE INTERNAL "")
endif ()
if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO")
set (_save_build_type ${CMAKE_BUILD_TYPE})
set (CMAKE_BUILD_TYPE Release)
string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
endif ()
# Because Arrow uses CMAKE_SOURCE_DIR as a project path
# Hopefully will be fixed in https://github.com/apache/arrow/pull/2676
set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/cmake_modules")
add_subdirectory (arrow/cpp)
if (_save_build_type)
set (CMAKE_BUILD_TYPE ${_save_build_type})
unset (_save_build_type)
string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
endif ()
else()
add_subdirectory(arrow-cmake)
# The library is large - avoid bloat.
@ -215,7 +185,6 @@ else()
target_compile_options (${THRIFT_LIBRARY} PRIVATE -g0)
target_compile_options (${PARQUET_LIBRARY} PRIVATE -g0)
endif()
endif()
if (USE_INTERNAL_AVRO_LIBRARY)
add_subdirectory(avro-cmake)

contrib/arrow vendored (2 changed lines)

@ -1 +1 @@
Subproject commit 3cbcb7b62c2f2d02851bff837758637eb592a64b
Subproject commit 744bdfe188f018e5e05f5deebd4e9ee0a7706cf4

View File

@ -144,15 +144,16 @@ set(ORC_SRCS
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow)
configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/arrow/util/config.h")
configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/cpp/src/arrow/util/config.h")
# arrow/cpp/src/arrow/CMakeLists.txt
set(ARROW_SRCS
${LIBRARY_DIR}/array.cc
${LIBRARY_DIR}/buffer.cc
${LIBRARY_DIR}/device.cc
${LIBRARY_DIR}/builder.cc
${LIBRARY_DIR}/chunked_array.cc
${LIBRARY_DIR}/compare.cc
${LIBRARY_DIR}/datum.cc
${LIBRARY_DIR}/device.cc
${LIBRARY_DIR}/extension_type.cc
${LIBRARY_DIR}/memory_pool.cc
${LIBRARY_DIR}/pretty_print.cc
@ -167,11 +168,12 @@ set(ARROW_SRCS
${LIBRARY_DIR}/type.cc
${LIBRARY_DIR}/visitor.cc
${LIBRARY_DIR}/tensor/coo_converter.cc
${LIBRARY_DIR}/tensor/csc_converter.cc
${LIBRARY_DIR}/tensor/csf_converter.cc
${LIBRARY_DIR}/tensor/csr_converter.cc
${LIBRARY_DIR}/array/array_base.cc
${LIBRARY_DIR}/array/array_binary.cc
${LIBRARY_DIR}/array/array_decimal.cc
${LIBRARY_DIR}/array/array_dict.cc
${LIBRARY_DIR}/array/array_nested.cc
${LIBRARY_DIR}/array/array_primitive.cc
${LIBRARY_DIR}/array/builder_adaptive.cc
${LIBRARY_DIR}/array/builder_base.cc
${LIBRARY_DIR}/array/builder_binary.cc
@ -181,17 +183,50 @@ set(ARROW_SRCS
${LIBRARY_DIR}/array/builder_primitive.cc
${LIBRARY_DIR}/array/builder_union.cc
${LIBRARY_DIR}/array/concatenate.cc
${LIBRARY_DIR}/array/dict_internal.cc
${LIBRARY_DIR}/array/data.cc
${LIBRARY_DIR}/array/diff.cc
${LIBRARY_DIR}/array/util.cc
${LIBRARY_DIR}/array/validate.cc
${LIBRARY_DIR}/csv/converter.cc
${LIBRARY_DIR}/compute/api_scalar.cc
${LIBRARY_DIR}/compute/api_vector.cc
${LIBRARY_DIR}/compute/cast.cc
${LIBRARY_DIR}/compute/exec.cc
${LIBRARY_DIR}/compute/function.cc
${LIBRARY_DIR}/compute/kernel.cc
${LIBRARY_DIR}/compute/registry.cc
${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc
${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc
${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc
${LIBRARY_DIR}/compute/kernels/codegen_internal.cc
${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc
${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc
${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc
${LIBRARY_DIR}/compute/kernels/scalar_compare.cc
${LIBRARY_DIR}/compute/kernels/scalar_fill_null.cc
${LIBRARY_DIR}/compute/kernels/scalar_nested.cc
${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc
${LIBRARY_DIR}/compute/kernels/scalar_string.cc
${LIBRARY_DIR}/compute/kernels/scalar_validity.cc
${LIBRARY_DIR}/compute/kernels/vector_hash.cc
${LIBRARY_DIR}/compute/kernels/vector_nested.cc
${LIBRARY_DIR}/compute/kernels/vector_selection.cc
${LIBRARY_DIR}/compute/kernels/vector_sort.cc
${LIBRARY_DIR}/compute/kernels/util_internal.cc
${LIBRARY_DIR}/csv/chunker.cc
${LIBRARY_DIR}/csv/column_builder.cc
${LIBRARY_DIR}/csv/column_decoder.cc
${LIBRARY_DIR}/csv/converter.cc
${LIBRARY_DIR}/csv/options.cc
${LIBRARY_DIR}/csv/parser.cc
${LIBRARY_DIR}/csv/reader.cc
${LIBRARY_DIR}/csv/column_decoder.cc
${LIBRARY_DIR}/ipc/dictionary.cc
${LIBRARY_DIR}/ipc/feather.cc
@ -202,14 +237,25 @@ set(ARROW_SRCS
${LIBRARY_DIR}/ipc/writer.cc
${LIBRARY_DIR}/io/buffered.cc
${LIBRARY_DIR}/io/caching.cc
${LIBRARY_DIR}/io/compressed.cc
${LIBRARY_DIR}/io/file.cc
${LIBRARY_DIR}/io/interfaces.cc
${LIBRARY_DIR}/io/memory.cc
${LIBRARY_DIR}/io/slow.cc
${LIBRARY_DIR}/tensor/coo_converter.cc
${LIBRARY_DIR}/tensor/csf_converter.cc
${LIBRARY_DIR}/tensor/csx_converter.cc
${LIBRARY_DIR}/util/basic_decimal.cc
${LIBRARY_DIR}/util/bit_block_counter.cc
${LIBRARY_DIR}/util/bit_run_reader.cc
${LIBRARY_DIR}/util/bit_util.cc
${LIBRARY_DIR}/util/bitmap.cc
${LIBRARY_DIR}/util/bitmap_builders.cc
${LIBRARY_DIR}/util/bitmap_ops.cc
${LIBRARY_DIR}/util/bpacking.cc
${LIBRARY_DIR}/util/compression.cc
${LIBRARY_DIR}/util/compression_lz4.cc
${LIBRARY_DIR}/util/compression_snappy.cc
@ -217,8 +263,12 @@ set(ARROW_SRCS
${LIBRARY_DIR}/util/compression_zstd.cc
${LIBRARY_DIR}/util/cpu_info.cc
${LIBRARY_DIR}/util/decimal.cc
${LIBRARY_DIR}/util/delimiting.cc
${LIBRARY_DIR}/util/formatting.cc
${LIBRARY_DIR}/util/future.cc
${LIBRARY_DIR}/util/int_util.cc
${LIBRARY_DIR}/util/io_util.cc
${LIBRARY_DIR}/util/iterator.cc
${LIBRARY_DIR}/util/key_value_metadata.cc
${LIBRARY_DIR}/util/logging.cc
${LIBRARY_DIR}/util/memory.cc
@ -226,27 +276,15 @@ set(ARROW_SRCS
${LIBRARY_DIR}/util/string.cc
${LIBRARY_DIR}/util/task_group.cc
${LIBRARY_DIR}/util/thread_pool.cc
${LIBRARY_DIR}/util/time.cc
${LIBRARY_DIR}/util/trie.cc
${LIBRARY_DIR}/util/utf8.cc
${LIBRARY_DIR}/util/future.cc
${LIBRARY_DIR}/util/formatting.cc
${LIBRARY_DIR}/util/parsing.cc
${LIBRARY_DIR}/util/time.cc
${LIBRARY_DIR}/util/delimiting.cc
${LIBRARY_DIR}/util/iterator.cc
${LIBRARY_DIR}/util/value_parsing.cc
${LIBRARY_DIR}/vendored/base64.cpp
${ORC_SRCS}
)
set(ARROW_SRCS ${ARROW_SRCS}
${LIBRARY_DIR}/compute/context.cc
${LIBRARY_DIR}/compute/kernels/boolean.cc
${LIBRARY_DIR}/compute/kernels/cast.cc
${LIBRARY_DIR}/compute/kernels/hash.cc
${LIBRARY_DIR}/compute/kernels/util_internal.cc
)
if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY)
set(ARROW_WITH_SNAPPY 1)
endif ()
@ -289,7 +327,8 @@ if (USE_INTERNAL_PROTOBUF_LIBRARY)
add_dependencies(${ARROW_LIBRARY} protoc)
endif ()
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src)
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src)
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/cpp/src)
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${Protobuf_LIBRARY})
target_link_libraries(${ARROW_LIBRARY} PRIVATE lz4)
if (ARROW_WITH_SNAPPY)
@ -319,19 +358,26 @@ set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/parquet)
set(GEN_LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/generated)
# arrow/cpp/src/parquet/CMakeLists.txt
set(PARQUET_SRCS
${LIBRARY_DIR}/arrow/path_internal.cc
${LIBRARY_DIR}/arrow/reader.cc
${LIBRARY_DIR}/arrow/reader_internal.cc
${LIBRARY_DIR}/arrow/schema.cc
${LIBRARY_DIR}/arrow/schema_internal.cc
${LIBRARY_DIR}/arrow/writer.cc
${LIBRARY_DIR}/arrow/path_internal.cc
${LIBRARY_DIR}/bloom_filter.cc
${LIBRARY_DIR}/column_reader.cc
${LIBRARY_DIR}/column_scanner.cc
${LIBRARY_DIR}/column_writer.cc
${LIBRARY_DIR}/deprecated_io.cc
${LIBRARY_DIR}/encoding.cc
${LIBRARY_DIR}/encryption.cc
${LIBRARY_DIR}/encryption_internal.cc
${LIBRARY_DIR}/file_reader.cc
${LIBRARY_DIR}/file_writer.cc
${LIBRARY_DIR}/internal_file_decryptor.cc
${LIBRARY_DIR}/internal_file_encryptor.cc
${LIBRARY_DIR}/level_conversion.cc
${LIBRARY_DIR}/level_comparison.cc
${LIBRARY_DIR}/metadata.cc
${LIBRARY_DIR}/murmur3.cc
${LIBRARY_DIR}/platform.cc
@ -340,10 +386,6 @@ set(PARQUET_SRCS
${LIBRARY_DIR}/schema.cc
${LIBRARY_DIR}/statistics.cc
${LIBRARY_DIR}/types.cc
${LIBRARY_DIR}/encryption.cc
${LIBRARY_DIR}/encryption_internal.cc
${LIBRARY_DIR}/internal_file_decryptor.cc
${LIBRARY_DIR}/internal_file_encryptor.cc
${GEN_LIBRARY_DIR}/parquet_constants.cpp
${GEN_LIBRARY_DIR}/parquet_types.cpp

View File

@ -1,26 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#define ARROW_VERSION_MAJOR
#define ARROW_VERSION_MINOR
#define ARROW_VERSION_PATCH
#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH
#define ARROW_SO_VERSION ""
#define ARROW_FULL_SO_VERSION ""
/* #undef GRPCPP_PP_INCLUDE */

View File

@ -22,8 +22,8 @@
#define PARQUET_VERSION_MINOR 5
#define PARQUET_VERSION_PATCH 1
#define PARQUET_SO_VERSION 0
#define PARQUET_FULL_SO_VERSION 0.17
#define PARQUET_SO_VERSION "200"
#define PARQUET_FULL_SO_VERSION "200.0.0"
// define the parquet created by version
#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT"

View File

@ -363,7 +363,7 @@ endif ()
if (USE_PARQUET)
dbms_target_link_libraries(PRIVATE ${PARQUET_LIBRARY})
if (NOT USE_INTERNAL_PARQUET_LIBRARY OR USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
if (NOT USE_INTERNAL_PARQUET_LIBRARY)
dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${PARQUET_INCLUDE_DIR} ${ARROW_INCLUDE_DIR})
if (USE_STATIC_LIBRARIES)
dbms_target_link_libraries(PRIVATE ${ARROW_LIBRARY})

View File

@ -62,9 +62,9 @@ void ArrowBlockOutputFormat::prepareWriter(const std::shared_ptr<arrow::Schema>
// TODO: should we use arrow::ipc::IpcOptions::alignment?
if (stream)
writer_status = arrow::ipc::NewStreamWriter(arrow_ostream.get(), schema);
writer_status = arrow::ipc::MakeStreamWriter(arrow_ostream.get(), schema);
else
writer_status = arrow::ipc::NewFileWriter(arrow_ostream.get(), schema);
writer_status = arrow::ipc::MakeFileWriter(arrow_ostream.get(), schema);
if (!writer_status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,

View File

@ -1,54 +0,0 @@
#!/usr/bin/env perl
package parquet_create_table_columns;

use strict;
no warnings 'experimental';
use feature 'signatures';
use JSON::XS;

#use Data::Dumper;

sub file_read($file) {
    open my $f, '<', $file or return;
    local $/ = undef;
    my $ret = <$f>;
    close $f;
    return $ret;
}

our $type_parquet_logical_to_clickhouse = {
    DECIMAL => 'Decimal128(1)',
    TIMESTAMP_MICROS => 'DateTime',
    TIMESTAMP_MILLIS => 'DateTime',
};

our $type_parquet_physical_to_clickhouse = {
    BOOLEAN => 'UInt8',
    INT32 => 'Int32',
    INT64 => 'Int64',
    FLOAT => 'Float32',
    DOUBLE => 'Float64',
    BYTE_ARRAY => 'String',
    FIXED_LEN_BYTE_ARRAY => 'String', # Maybe FixedString?
    INT96 => 'Int64', # TODO!
};

sub columns ($json) {
    my @list;
    my %uniq;
    for my $column (@{$json->{Columns}}) {
        #warn Data::Dumper::Dumper $column;
        my $name = $column->{'Name'};
        my $type = $type_parquet_logical_to_clickhouse->{$column->{'LogicalType'}} || $type_parquet_physical_to_clickhouse->{$column->{'PhysicalType'}};
        unless ($type) {
            warn "Unknown type [$column->{'PhysicalType'}:$column->{'LogicalType'}] of column [$name]";
        }
        $type = "Nullable($type)";
        $name .= $column->{'Id'} if $uniq{$name}++; # Names can be non-unique
        push @list, {name => $name, type => $type};
    }
    print join ', ', map {"`$_->{name}` $_->{type}"} @list;
}

sub columns_file ($file) {
    return columns(JSON::XS::decode_json(file_read($file)));
}

columns_file(shift) unless caller;

View File

@ -13,134 +13,220 @@
=== Try load data from alltypes_plain.snappy.parquet
6 1 0 0 0 0 0 0 04/01/09 0 1238544000
7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060
=== Try load data from binary.parquet
\0







\b
\t
\n
=== Try load data from byte_array_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from datapage_v2.snappy.parquet
Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet implemented: Unsupported encoding.: data for INSERT was parsed from stdin
=== Try load data from dict-page-offset-zero.parquet
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
1552
=== Try load data from fixed_length_decimal.parquet
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from fixed_length_decimal_1.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from fixed_length_decimal_legacy.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from hadoop_lz4_compressed.parquet
1593604800 abc 42
1593604800 def 7.7
1593604801 abc 42.125
1593604801 def 7.7
=== Try load data from int32_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from int64_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
1.00
2.00
3.00
4.00
5.00
6.00
7.00
8.00
9.00
10.00
11.00
12.00
13.00
14.00
15.00
16.00
17.00
18.00
19.00
20.00
21.00
22.00
23.00
24.00
=== Try load data from list_columns.parquet
Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
=== Try load data from nation.dict-malformed.parquet
0 ALGERIA 0 haggle. carefully final deposits detect slyly agai
1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon
@ -168,23 +254,25 @@ Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet impl
23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull
24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be
=== Try load data from nested_lists.snappy.parquet
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
Code: 70. DB::Ex---tion: The type "list" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
=== Try load data from nested_maps.snappy.parquet
Code: 33. DB::Ex---tion: Error while reading Parquet data: NotImplemented: Reading lists of structs from Parquet files not yet supported: key_value: list<key_value: struct<key: string not null, value: struct<key_value: list<key_value: struct<key: int32 not null, value: bool not null> not null> not null>> not null> not null: data for INSERT was parsed from stdin
Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
=== Try load data from non_hadoop_lz4_compressed.parquet
1593604800 abc 42
1593604800 def 7.7
1593604801 abc 42.125
1593604801 def 7.7
=== Try load data from nonnullable.impala.parquet
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
=== Try load data from nullable.impala.parquet
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
=== Try load data from nulls.snappy.parquet
Code: 8. DB::Ex---tion: Column "b_c_int" is not presented in input data: data for INSERT was parsed from stdin
=== Try load data from repeated_no_annotation.parquet
Code: 8. DB::Ex---tion: Column "number" is not presented in input data: data for INSERT was parsed from stdin
Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
=== Try load data from single_nan.parquet
\N
=== Try load data from userdata1.parquet
1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02
1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV

View File

@ -5,8 +5,6 @@
# TODO: Add more files.
#
# To regenerate data install perl JSON::XS module: sudo apt install libjson-xs-perl
# Also 5 sample files from
# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet
# ...
@ -19,38 +17,46 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CUR_DIR"/../shell_config.sh
CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY")
[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../../..
[ "$CB_DIR" != "." ] && BUILD_DIR=$CB_DIR/../..
[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../../..
[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../..
[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../..
DATA_DIR=$CUR_DIR/data_parquet
[ -n "$ROOT_DIR" ] && [ -z "$PARQUET_READER" ] && PARQUET_READER="$ROOT_DIR"/contrib/arrow/cpp/build/release/parquet-reader
# To update:
# cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/
# BUG! nulls.snappy.parquet - parquet-reader shows wrong structure. Actual structure is {"type":"struct","fields":[{"name":"b_struct","type":{"type":"struct","fields":[{"name":"b_c_int","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}
# why? repeated_no_annotation.parquet
# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue.
# There is a failure when parsing nested arrays or nested maps with NULLs:
# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | sort); do
# Strange behaviour for repeated_no_annotation.parquet around __builtin_expect, so this file was disabled:
# debug:
# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type)
# release:
# Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin
for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_ALL=C sort); do
echo === Try load data from "$NAME"
JSON=$DATA_DIR/$NAME.json
COLUMNS_FILE=$DATA_DIR/$NAME.columns
# If you want to change or add a .parquet file - rm data_parquet/*.json data_parquet/*.columns
[ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$BUILD_DIR"/contrib/arrow-cmake/parquet-reader --json "$DATA_DIR"/"$NAME" > "$JSON"
[ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/00900_parquet_create_table_columns.pl "$JSON" > "$COLUMNS_FILE"
[ -n "$PARQUET_READER" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$PARQUET_READER" --json "$DATA_DIR"/"$NAME" > "$JSON"
[ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/helpers/00900_parquet_create_table_columns.py "$JSON" > "$COLUMNS_FILE"
# Debug only:
# [ -n "$BUILD_DIR" ] && $BUILD_DIR/contrib/arrow-cmake/parquet-reader $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump
# [ -n "$PARQUET_READER" ] && $PARQUET_READER $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump
#COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.pl $JSON` 2>&1 || continue
# COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.py $JSON` 2>&1 || continue
COLUMNS=$(cat "$COLUMNS_FILE") || continue
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load ($COLUMNS) ENGINE = Memory"
# Some files are broken, exception is ok.
# Some files contain unsupported data structures, exception is ok.
cat "$DATA_DIR"/"$NAME" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_load FORMAT Parquet" 2>&1 | sed 's/Exception/Ex---tion/'
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load LIMIT 100"
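
The test script now looks for an externally built parquet-reader (see the PARQUET_READER assignment above) instead of a binary from the ClickHouse build directory. A sketch of regenerating the schema metadata for a single test file, assuming the build commands from the contrib/CMakeLists.txt comments and the variables defined in this script:

# One-time: build the bundled Arrow together with the parquet-reader utility.
cd "$ROOT_DIR"/contrib/arrow/cpp && mkdir -p build && cd build
cmake .. -DARROW_PARQUET=1 -DARROW_WITH_SNAPPY=1 -DPARQUET_BUILD_EXECUTABLES=1 && make -j8
# Dump the schema as JSON and rebuild the CREATE TABLE column list for one file.
"$PARQUET_READER" --json "$DATA_DIR"/int32_decimal.parquet > "$DATA_DIR"/int32_decimal.parquet.json
"$CUR_DIR"/helpers/00900_parquet_create_table_columns.py "$DATA_DIR"/int32_decimal.parquet.json > "$DATA_DIR"/int32_decimal.parquet.columns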

Binary file not shown.

View File

@ -0,0 +1 @@
`foo` Nullable(String)

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(4, 2))

View File

@ -1 +1 @@
`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `element` Nullable(Int32)
`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Nullable(Int32)

View File

@ -0,0 +1 @@
`l_partkey` Nullable(Int32)

View File

@ -0,0 +1 @@
`value` Nullable(Decimal(25, 2))

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(25, 2))

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(13, 2))

View File

@ -0,0 +1 @@
`c0` Nullable(Int64), `c1` Nullable(String), `v11` Nullable(Float64)

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(4, 2))

View File

@ -1 +1 @@
`value` Nullable(Decimal128(1))
`value` Nullable(Decimal(10, 2))

View File

@ -0,0 +1 @@
`int64_list` Nullable(Int64), `utf8_list` Nullable(String)

View File

@ -1 +1 @@
`element` Nullable(String), `b` Nullable(Int32)
`a` Nullable(String), `b` Nullable(Int32)

View File

@ -1 +1 @@
`key` Nullable(String), `key1` Nullable(Int32), `value` Nullable(UInt8), `b` Nullable(Int32), `c` Nullable(Float64)
`a` Tuple(Nullable(String), Nullable(Int32), Nullable(UInt8)), `b` Nullable(Int32), `c` Nullable(Float64)

View File

@ -0,0 +1 @@
`c0` Nullable(Int64), `c1` Nullable(String), `v11` Nullable(Float64)

View File

@ -1 +1 @@
`ID` Nullable(Int64), `element` Nullable(Int32), `element2` Nullable(Int32), `key` Nullable(String), `value` Nullable(Int32), `key5` Nullable(String), `value6` Nullable(Int32), `a` Nullable(Int32), `element8` Nullable(Int32), `e` Nullable(Int32), `f` Nullable(String), `key11` Nullable(String), `element12` Nullable(Float64)
`ID` Nullable(Int64), `Int_Array` Nullable(Int32), `int_array_array` Nullable(Int32), `Int_Map` Tuple(Nullable(String), Nullable(Int32)), `int_map_array` Tuple(Nullable(String), Nullable(Int32)), `nested_Struct` Tuple(Nullable(Int32), Nullable(Int32), Nullable(Int32), Nullable(String), Nullable(String), Nullable(Float64))

View File

@ -1 +1 @@
`id` Nullable(Int64), `element` Nullable(Int32), `element2` Nullable(Int32), `key` Nullable(String), `value` Nullable(Int32), `key5` Nullable(String), `value6` Nullable(Int32), `A` Nullable(Int32), `element8` Nullable(Int32), `E` Nullable(Int32), `F` Nullable(String), `key11` Nullable(String), `element12` Nullable(Float64)
`id` Nullable(Int64), `int_array` Nullable(Int32), `int_array_Array` Nullable(Int32), `int_map` Tuple(Nullable(String), Nullable(Int32)), `int_Map_Array` Tuple(Nullable(String), Nullable(Int32)), `nested_struct` Tuple(Nullable(Int32), Nullable(Int32), Nullable(Int32), Nullable(String), Nullable(String), Nullable(Float64))

View File

@ -1 +1 @@
`b_c_int` Nullable(Int32)
`b_struct` Nullable(Int32)

View File

@ -1 +0,0 @@
`id` Nullable(Int32), `number` Nullable(Int64), `kind` Nullable(String)

View File

@ -0,0 +1 @@
`mycol` Nullable(Float64)

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python3

import json
import sys

TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE = {
    "TIMESTAMP_MICROS": "DateTime",
    "TIMESTAMP_MILLIS": "DateTime",
    "UTF8": "String",
}

TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE = {
    "BOOLEAN": "UInt8",
    "INT32": "Int32",
    "INT64": "Int64",
    "FLOAT": "Float32",
    "DOUBLE": "Float64",
    "BYTE_ARRAY": "String",
    "INT96": "Int64", # TODO!
}


def read_file(filename):
    with open(filename, "rb") as f:
        return f.read().decode("raw_unicode_escape")


def get_column_name(column):
    return column["Name"].split(".", 1)[0]


def resolve_clickhouse_column_type(column):
    column_name = get_column_name(column)
    logical_type = column.get("LogicalType", {})
    converted_type = column.get("ConvertedType", "").upper()
    physical_type = column.get("PhysicalType", "").upper()
    if logical_type and logical_type.get("Type", "").upper() == "DECIMAL":
        precision = int(logical_type["precision"])
        scale = int(logical_type["scale"])
        if precision < 1 or precision > 76:
            raise RuntimeError("Column {} has invalid Decimal precision {}".format(column_name, precision))
        if precision > 38:
            raise RuntimeError("Column {} has unsupported Decimal precision {}".format(column_name, precision))
        if scale < 0 or scale > precision:
            raise RuntimeError("Column {} has invalid Decimal scale {} for precision {}".format(column_name, scale, precision))
        return "Decimal({}, {})".format(precision, scale)
    if converted_type and converted_type != "NONE":
        result_type = TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE.get(converted_type)
        if result_type:
            return result_type
        raise RuntimeError("Column {} has unknown ConvertedType: {}".format(column_name, converted_type))
    if physical_type and physical_type != "NONE":
        result_type = TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE.get(physical_type)
        if result_type:
            return result_type
        raise RuntimeError("Column {} has unknown PhysicalType: {}".format(column_name, physical_type))
    raise RuntimeError("Column {} has invalid types: ConvertedType={}, PhysicalType={}".format(column_name, converted_type, physical_type))


def dump_columns(obj):
    descr_by_column_name = {}
    columns_descr = []
    for column in obj["Columns"]:
        column_name = get_column_name(column)
        column_type = resolve_clickhouse_column_type(column)
        result_type = "Nullable({})".format(column_type)
        if column_name in descr_by_column_name:
            descr = descr_by_column_name[column_name]
            descr["types"].append(result_type)
        else:
            descr = {
                "name": column_name,
                "types": [result_type],
            }
            descr_by_column_name[column_name] = descr
            columns_descr.append(descr)

    # Make tuples from nested types. CH Server doesn't support such Arrow type but it makes Server Exceptions more relevant.
    def _format_type(types):
        if len(types) == 1:
            return types[0]
        else:
            return "Tuple({})".format(", ".join(types))

    print(", ".join(map(lambda descr: "`{}` {}".format(descr["name"], _format_type(descr["types"])), columns_descr)))


def dump_columns_from_file(filename):
    dump_columns(json.loads(read_file(filename), strict=False))


if __name__ == "__main__":
    filename = sys.argv[1]
    dump_columns_from_file(filename)
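
Nested Parquet columns share a top-level name, so their types are grouped into a Tuple, as the comment in dump_columns notes. A self-contained sketch of that behaviour; the JSON below is synthetic and merely mimics the shape of parquet-reader --json output, and the script path is relative to tests/queries/0_stateless:

# Synthetic input: two leaf columns under "a" plus a plain "b" column.
cat > /tmp/example.json <<'EOF'
{"Columns": [
  {"Name": "a.key_value.key",   "PhysicalType": "BYTE_ARRAY", "ConvertedType": "UTF8"},
  {"Name": "a.key_value.value", "PhysicalType": "INT32",      "ConvertedType": "NONE"},
  {"Name": "b",                 "PhysicalType": "INT32",      "ConvertedType": "NONE"}
]}
EOF
python3 helpers/00900_parquet_create_table_columns.py /tmp/example.json
# Expected output:
# `a` Tuple(Nullable(String), Nullable(Int32)), `b` Nullable(Int32)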