mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-01 03:52:15 +00:00
Merge pull request #16664 from FawnD2/switch-upstream-for-arrow-submodule
Switch upstream repo for Arrow submodule
This commit is contained in:
commit
a4b0d9ba4c
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -53,7 +53,8 @@
|
|||||||
url = https://github.com/ClickHouse-Extras/Turbo-Base64.git
|
url = https://github.com/ClickHouse-Extras/Turbo-Base64.git
|
||||||
[submodule "contrib/arrow"]
|
[submodule "contrib/arrow"]
|
||||||
path = contrib/arrow
|
path = contrib/arrow
|
||||||
url = https://github.com/apache/arrow
|
url = https://github.com/ClickHouse-Extras/arrow
|
||||||
|
branch = clickhouse-arrow-2.0.0
|
||||||
[submodule "contrib/thrift"]
|
[submodule "contrib/thrift"]
|
||||||
path = contrib/thrift
|
path = contrib/thrift
|
||||||
url = https://github.com/apache/thrift.git
|
url = https://github.com/apache/thrift.git
|
||||||
|
@ -141,11 +141,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O
|
|||||||
else()
|
else()
|
||||||
set(USE_INTERNAL_PARQUET_LIBRARY 1)
|
set(USE_INTERNAL_PARQUET_LIBRARY 1)
|
||||||
|
|
||||||
if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
|
|
||||||
set(ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src")
|
|
||||||
set(PARQUET_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src" ${ClickHouse_BINARY_DIR}/contrib/arrow/cpp/src)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(MAKE_STATIC_LIBRARIES)
|
if(MAKE_STATIC_LIBRARIES)
|
||||||
set(FLATBUFFERS_LIBRARY flatbuffers)
|
set(FLATBUFFERS_LIBRARY flatbuffers)
|
||||||
set(ARROW_LIBRARY arrow_static)
|
set(ARROW_LIBRARY arrow_static)
|
||||||
@ -155,9 +150,6 @@ if(NOT EXTERNAL_PARQUET_FOUND AND NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT O
|
|||||||
set(FLATBUFFERS_LIBRARY flatbuffers_shared)
|
set(FLATBUFFERS_LIBRARY flatbuffers_shared)
|
||||||
set(ARROW_LIBRARY arrow_shared)
|
set(ARROW_LIBRARY arrow_shared)
|
||||||
set(PARQUET_LIBRARY parquet_shared)
|
set(PARQUET_LIBRARY parquet_shared)
|
||||||
if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
|
|
||||||
list(APPEND PARQUET_LIBRARY boost::regex)
|
|
||||||
endif()
|
|
||||||
set(THRIFT_LIBRARY thrift)
|
set(THRIFT_LIBRARY thrift)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
53
contrib/CMakeLists.txt
vendored
53
contrib/CMakeLists.txt
vendored
@ -163,51 +163,21 @@ if(USE_INTERNAL_SNAPPY_LIBRARY)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (USE_INTERNAL_PARQUET_LIBRARY)
|
if (USE_INTERNAL_PARQUET_LIBRARY)
|
||||||
if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
|
|
||||||
# We dont use arrow's cmakefiles because they uses too many depends and download some libs in compile time
|
# We dont use arrow's cmakefiles because they uses too many depends and download some libs in compile time
|
||||||
# But this mode can be used for updating auto-generated parquet files:
|
# But you can update auto-generated parquet files manually:
|
||||||
# cmake -DUSE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE=1 -DUSE_STATIC_LIBRARIES=0
|
# cd {BUILD_DIR}/contrib/arrow/cpp/src/parquet && mkdir -p build && cd build
|
||||||
# copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> /contrib/arrow-cmake/cpp/src/parquet/
|
# cmake .. -DARROW_COMPUTE=ON -DARROW_PARQUET=ON -DARROW_SIMD_LEVEL=NONE -DARROW_VERBOSE_THIRDPARTY_BUILD=ON
|
||||||
|
# -DARROW_BUILD_SHARED=1 -DARROW_BUILD_UTILITIES=OFF -DARROW_BUILD_INTEGRATION=OFF
|
||||||
|
# -DBoost_FOUND=1 -DARROW_TEST_LINKAGE="shared"
|
||||||
|
# make -j8
|
||||||
|
# copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> {BUILD_DIR}/contrib/arrow-cmake/cpp/src/parquet/
|
||||||
|
|
||||||
# Also useful parquet reader:
|
# Also useful parquet reader:
|
||||||
# cd contrib/arrow/cpp/build && mkdir -p build && cmake .. -DPARQUET_BUILD_EXECUTABLES=1 && make -j8
|
# cd {BUILD_DIR}/contrib/arrow/cpp && mkdir -p build && cd build
|
||||||
# contrib/arrow/cpp/build/debug/parquet-reader some_file.parquet
|
# cmake .. -DARROW_PARQUET=1 -DARROW_WITH_SNAPPY=1 -DPARQUET_BUILD_EXECUTABLES=1
|
||||||
|
# make -j8
|
||||||
|
# {BUILD_DIR}/contrib/arrow/cpp/build/release/parquet-reader some_file.parquet
|
||||||
|
|
||||||
set (ARROW_COMPUTE ON CACHE INTERNAL "")
|
|
||||||
set (ARROW_PARQUET ON CACHE INTERNAL "")
|
|
||||||
set (ARROW_VERBOSE_THIRDPARTY_BUILD ON CACHE INTERNAL "")
|
|
||||||
set (ARROW_BUILD_SHARED 1 CACHE INTERNAL "")
|
|
||||||
set (ARROW_BUILD_UTILITIES OFF CACHE INTERNAL "")
|
|
||||||
set (ARROW_BUILD_INTEGRATION OFF CACHE INTERNAL "")
|
|
||||||
set (ARROW_BOOST_HEADER_ONLY ON CACHE INTERNAL "")
|
|
||||||
set (Boost_FOUND 1 CACHE INTERNAL "")
|
|
||||||
if (MAKE_STATIC_LIBRARIES)
|
|
||||||
set (PARQUET_ARROW_LINKAGE "static" CACHE INTERNAL "")
|
|
||||||
set (ARROW_TEST_LINKAGE "static" CACHE INTERNAL "")
|
|
||||||
set (ARROW_BUILD_STATIC ${MAKE_STATIC_LIBRARIES} CACHE INTERNAL "")
|
|
||||||
else ()
|
|
||||||
set (PARQUET_ARROW_LINKAGE "shared" CACHE INTERNAL "")
|
|
||||||
set (ARROW_TEST_LINKAGE "shared" CACHE INTERNAL "")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO")
|
|
||||||
set (_save_build_type ${CMAKE_BUILD_TYPE})
|
|
||||||
set (CMAKE_BUILD_TYPE Release)
|
|
||||||
string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
# Because Arrow uses CMAKE_SOURCE_DIR as a project path
|
|
||||||
# Hopefully will be fixed in https://github.com/apache/arrow/pull/2676
|
|
||||||
set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/cmake_modules")
|
|
||||||
add_subdirectory (arrow/cpp)
|
|
||||||
|
|
||||||
if (_save_build_type)
|
|
||||||
set (CMAKE_BUILD_TYPE ${_save_build_type})
|
|
||||||
unset (_save_build_type)
|
|
||||||
string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
else()
|
|
||||||
add_subdirectory(arrow-cmake)
|
add_subdirectory(arrow-cmake)
|
||||||
|
|
||||||
# The library is large - avoid bloat.
|
# The library is large - avoid bloat.
|
||||||
@ -215,7 +185,6 @@ else()
|
|||||||
target_compile_options (${THRIFT_LIBRARY} PRIVATE -g0)
|
target_compile_options (${THRIFT_LIBRARY} PRIVATE -g0)
|
||||||
target_compile_options (${PARQUET_LIBRARY} PRIVATE -g0)
|
target_compile_options (${PARQUET_LIBRARY} PRIVATE -g0)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
|
||||||
|
|
||||||
if (USE_INTERNAL_AVRO_LIBRARY)
|
if (USE_INTERNAL_AVRO_LIBRARY)
|
||||||
add_subdirectory(avro-cmake)
|
add_subdirectory(avro-cmake)
|
||||||
|
2
contrib/arrow
vendored
2
contrib/arrow
vendored
@ -1 +1 @@
|
|||||||
Subproject commit 3cbcb7b62c2f2d02851bff837758637eb592a64b
|
Subproject commit 744bdfe188f018e5e05f5deebd4e9ee0a7706cf4
|
@ -144,15 +144,16 @@ set(ORC_SRCS
|
|||||||
|
|
||||||
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow)
|
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow)
|
||||||
|
|
||||||
configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/arrow/util/config.h")
|
configure_file("${LIBRARY_DIR}/util/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/cpp/src/arrow/util/config.h")
|
||||||
|
|
||||||
# arrow/cpp/src/arrow/CMakeLists.txt
|
# arrow/cpp/src/arrow/CMakeLists.txt
|
||||||
set(ARROW_SRCS
|
set(ARROW_SRCS
|
||||||
${LIBRARY_DIR}/array.cc
|
|
||||||
${LIBRARY_DIR}/buffer.cc
|
${LIBRARY_DIR}/buffer.cc
|
||||||
${LIBRARY_DIR}/device.cc
|
|
||||||
${LIBRARY_DIR}/builder.cc
|
${LIBRARY_DIR}/builder.cc
|
||||||
|
${LIBRARY_DIR}/chunked_array.cc
|
||||||
${LIBRARY_DIR}/compare.cc
|
${LIBRARY_DIR}/compare.cc
|
||||||
|
${LIBRARY_DIR}/datum.cc
|
||||||
|
${LIBRARY_DIR}/device.cc
|
||||||
${LIBRARY_DIR}/extension_type.cc
|
${LIBRARY_DIR}/extension_type.cc
|
||||||
${LIBRARY_DIR}/memory_pool.cc
|
${LIBRARY_DIR}/memory_pool.cc
|
||||||
${LIBRARY_DIR}/pretty_print.cc
|
${LIBRARY_DIR}/pretty_print.cc
|
||||||
@ -167,11 +168,12 @@ set(ARROW_SRCS
|
|||||||
${LIBRARY_DIR}/type.cc
|
${LIBRARY_DIR}/type.cc
|
||||||
${LIBRARY_DIR}/visitor.cc
|
${LIBRARY_DIR}/visitor.cc
|
||||||
|
|
||||||
${LIBRARY_DIR}/tensor/coo_converter.cc
|
${LIBRARY_DIR}/array/array_base.cc
|
||||||
${LIBRARY_DIR}/tensor/csc_converter.cc
|
${LIBRARY_DIR}/array/array_binary.cc
|
||||||
${LIBRARY_DIR}/tensor/csf_converter.cc
|
${LIBRARY_DIR}/array/array_decimal.cc
|
||||||
${LIBRARY_DIR}/tensor/csr_converter.cc
|
${LIBRARY_DIR}/array/array_dict.cc
|
||||||
|
${LIBRARY_DIR}/array/array_nested.cc
|
||||||
|
${LIBRARY_DIR}/array/array_primitive.cc
|
||||||
${LIBRARY_DIR}/array/builder_adaptive.cc
|
${LIBRARY_DIR}/array/builder_adaptive.cc
|
||||||
${LIBRARY_DIR}/array/builder_base.cc
|
${LIBRARY_DIR}/array/builder_base.cc
|
||||||
${LIBRARY_DIR}/array/builder_binary.cc
|
${LIBRARY_DIR}/array/builder_binary.cc
|
||||||
@ -181,17 +183,50 @@ set(ARROW_SRCS
|
|||||||
${LIBRARY_DIR}/array/builder_primitive.cc
|
${LIBRARY_DIR}/array/builder_primitive.cc
|
||||||
${LIBRARY_DIR}/array/builder_union.cc
|
${LIBRARY_DIR}/array/builder_union.cc
|
||||||
${LIBRARY_DIR}/array/concatenate.cc
|
${LIBRARY_DIR}/array/concatenate.cc
|
||||||
${LIBRARY_DIR}/array/dict_internal.cc
|
${LIBRARY_DIR}/array/data.cc
|
||||||
${LIBRARY_DIR}/array/diff.cc
|
${LIBRARY_DIR}/array/diff.cc
|
||||||
|
${LIBRARY_DIR}/array/util.cc
|
||||||
${LIBRARY_DIR}/array/validate.cc
|
${LIBRARY_DIR}/array/validate.cc
|
||||||
|
|
||||||
${LIBRARY_DIR}/csv/converter.cc
|
${LIBRARY_DIR}/compute/api_scalar.cc
|
||||||
|
${LIBRARY_DIR}/compute/api_vector.cc
|
||||||
|
${LIBRARY_DIR}/compute/cast.cc
|
||||||
|
${LIBRARY_DIR}/compute/exec.cc
|
||||||
|
${LIBRARY_DIR}/compute/function.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernel.cc
|
||||||
|
${LIBRARY_DIR}/compute/registry.cc
|
||||||
|
|
||||||
|
${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/codegen_internal.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_compare.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_fill_null.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_nested.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_string.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/scalar_validity.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/vector_hash.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/vector_nested.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/vector_selection.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/vector_sort.cc
|
||||||
|
${LIBRARY_DIR}/compute/kernels/util_internal.cc
|
||||||
|
|
||||||
${LIBRARY_DIR}/csv/chunker.cc
|
${LIBRARY_DIR}/csv/chunker.cc
|
||||||
${LIBRARY_DIR}/csv/column_builder.cc
|
${LIBRARY_DIR}/csv/column_builder.cc
|
||||||
|
${LIBRARY_DIR}/csv/column_decoder.cc
|
||||||
|
${LIBRARY_DIR}/csv/converter.cc
|
||||||
${LIBRARY_DIR}/csv/options.cc
|
${LIBRARY_DIR}/csv/options.cc
|
||||||
${LIBRARY_DIR}/csv/parser.cc
|
${LIBRARY_DIR}/csv/parser.cc
|
||||||
${LIBRARY_DIR}/csv/reader.cc
|
${LIBRARY_DIR}/csv/reader.cc
|
||||||
${LIBRARY_DIR}/csv/column_decoder.cc
|
|
||||||
|
|
||||||
${LIBRARY_DIR}/ipc/dictionary.cc
|
${LIBRARY_DIR}/ipc/dictionary.cc
|
||||||
${LIBRARY_DIR}/ipc/feather.cc
|
${LIBRARY_DIR}/ipc/feather.cc
|
||||||
@ -202,14 +237,25 @@ set(ARROW_SRCS
|
|||||||
${LIBRARY_DIR}/ipc/writer.cc
|
${LIBRARY_DIR}/ipc/writer.cc
|
||||||
|
|
||||||
${LIBRARY_DIR}/io/buffered.cc
|
${LIBRARY_DIR}/io/buffered.cc
|
||||||
|
${LIBRARY_DIR}/io/caching.cc
|
||||||
${LIBRARY_DIR}/io/compressed.cc
|
${LIBRARY_DIR}/io/compressed.cc
|
||||||
${LIBRARY_DIR}/io/file.cc
|
${LIBRARY_DIR}/io/file.cc
|
||||||
${LIBRARY_DIR}/io/interfaces.cc
|
${LIBRARY_DIR}/io/interfaces.cc
|
||||||
${LIBRARY_DIR}/io/memory.cc
|
${LIBRARY_DIR}/io/memory.cc
|
||||||
${LIBRARY_DIR}/io/slow.cc
|
${LIBRARY_DIR}/io/slow.cc
|
||||||
|
|
||||||
|
${LIBRARY_DIR}/tensor/coo_converter.cc
|
||||||
|
${LIBRARY_DIR}/tensor/csf_converter.cc
|
||||||
|
${LIBRARY_DIR}/tensor/csx_converter.cc
|
||||||
|
|
||||||
${LIBRARY_DIR}/util/basic_decimal.cc
|
${LIBRARY_DIR}/util/basic_decimal.cc
|
||||||
|
${LIBRARY_DIR}/util/bit_block_counter.cc
|
||||||
|
${LIBRARY_DIR}/util/bit_run_reader.cc
|
||||||
${LIBRARY_DIR}/util/bit_util.cc
|
${LIBRARY_DIR}/util/bit_util.cc
|
||||||
|
${LIBRARY_DIR}/util/bitmap.cc
|
||||||
|
${LIBRARY_DIR}/util/bitmap_builders.cc
|
||||||
|
${LIBRARY_DIR}/util/bitmap_ops.cc
|
||||||
|
${LIBRARY_DIR}/util/bpacking.cc
|
||||||
${LIBRARY_DIR}/util/compression.cc
|
${LIBRARY_DIR}/util/compression.cc
|
||||||
${LIBRARY_DIR}/util/compression_lz4.cc
|
${LIBRARY_DIR}/util/compression_lz4.cc
|
||||||
${LIBRARY_DIR}/util/compression_snappy.cc
|
${LIBRARY_DIR}/util/compression_snappy.cc
|
||||||
@ -217,8 +263,12 @@ set(ARROW_SRCS
|
|||||||
${LIBRARY_DIR}/util/compression_zstd.cc
|
${LIBRARY_DIR}/util/compression_zstd.cc
|
||||||
${LIBRARY_DIR}/util/cpu_info.cc
|
${LIBRARY_DIR}/util/cpu_info.cc
|
||||||
${LIBRARY_DIR}/util/decimal.cc
|
${LIBRARY_DIR}/util/decimal.cc
|
||||||
|
${LIBRARY_DIR}/util/delimiting.cc
|
||||||
|
${LIBRARY_DIR}/util/formatting.cc
|
||||||
|
${LIBRARY_DIR}/util/future.cc
|
||||||
${LIBRARY_DIR}/util/int_util.cc
|
${LIBRARY_DIR}/util/int_util.cc
|
||||||
${LIBRARY_DIR}/util/io_util.cc
|
${LIBRARY_DIR}/util/io_util.cc
|
||||||
|
${LIBRARY_DIR}/util/iterator.cc
|
||||||
${LIBRARY_DIR}/util/key_value_metadata.cc
|
${LIBRARY_DIR}/util/key_value_metadata.cc
|
||||||
${LIBRARY_DIR}/util/logging.cc
|
${LIBRARY_DIR}/util/logging.cc
|
||||||
${LIBRARY_DIR}/util/memory.cc
|
${LIBRARY_DIR}/util/memory.cc
|
||||||
@ -226,27 +276,15 @@ set(ARROW_SRCS
|
|||||||
${LIBRARY_DIR}/util/string.cc
|
${LIBRARY_DIR}/util/string.cc
|
||||||
${LIBRARY_DIR}/util/task_group.cc
|
${LIBRARY_DIR}/util/task_group.cc
|
||||||
${LIBRARY_DIR}/util/thread_pool.cc
|
${LIBRARY_DIR}/util/thread_pool.cc
|
||||||
|
${LIBRARY_DIR}/util/time.cc
|
||||||
${LIBRARY_DIR}/util/trie.cc
|
${LIBRARY_DIR}/util/trie.cc
|
||||||
${LIBRARY_DIR}/util/utf8.cc
|
${LIBRARY_DIR}/util/utf8.cc
|
||||||
${LIBRARY_DIR}/util/future.cc
|
${LIBRARY_DIR}/util/value_parsing.cc
|
||||||
${LIBRARY_DIR}/util/formatting.cc
|
|
||||||
${LIBRARY_DIR}/util/parsing.cc
|
|
||||||
${LIBRARY_DIR}/util/time.cc
|
|
||||||
${LIBRARY_DIR}/util/delimiting.cc
|
|
||||||
${LIBRARY_DIR}/util/iterator.cc
|
|
||||||
|
|
||||||
${LIBRARY_DIR}/vendored/base64.cpp
|
${LIBRARY_DIR}/vendored/base64.cpp
|
||||||
${ORC_SRCS}
|
${ORC_SRCS}
|
||||||
)
|
)
|
||||||
|
|
||||||
set(ARROW_SRCS ${ARROW_SRCS}
|
|
||||||
${LIBRARY_DIR}/compute/context.cc
|
|
||||||
${LIBRARY_DIR}/compute/kernels/boolean.cc
|
|
||||||
${LIBRARY_DIR}/compute/kernels/cast.cc
|
|
||||||
${LIBRARY_DIR}/compute/kernels/hash.cc
|
|
||||||
${LIBRARY_DIR}/compute/kernels/util_internal.cc
|
|
||||||
)
|
|
||||||
|
|
||||||
if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY)
|
if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY)
|
||||||
set(ARROW_WITH_SNAPPY 1)
|
set(ARROW_WITH_SNAPPY 1)
|
||||||
endif ()
|
endif ()
|
||||||
@ -289,7 +327,8 @@ if (USE_INTERNAL_PROTOBUF_LIBRARY)
|
|||||||
add_dependencies(${ARROW_LIBRARY} protoc)
|
add_dependencies(${ARROW_LIBRARY} protoc)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src)
|
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src)
|
||||||
|
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/cpp/src)
|
||||||
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${Protobuf_LIBRARY})
|
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${Protobuf_LIBRARY})
|
||||||
target_link_libraries(${ARROW_LIBRARY} PRIVATE lz4)
|
target_link_libraries(${ARROW_LIBRARY} PRIVATE lz4)
|
||||||
if (ARROW_WITH_SNAPPY)
|
if (ARROW_WITH_SNAPPY)
|
||||||
@ -319,19 +358,26 @@ set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/parquet)
|
|||||||
set(GEN_LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/generated)
|
set(GEN_LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/generated)
|
||||||
# arrow/cpp/src/parquet/CMakeLists.txt
|
# arrow/cpp/src/parquet/CMakeLists.txt
|
||||||
set(PARQUET_SRCS
|
set(PARQUET_SRCS
|
||||||
|
${LIBRARY_DIR}/arrow/path_internal.cc
|
||||||
${LIBRARY_DIR}/arrow/reader.cc
|
${LIBRARY_DIR}/arrow/reader.cc
|
||||||
${LIBRARY_DIR}/arrow/reader_internal.cc
|
${LIBRARY_DIR}/arrow/reader_internal.cc
|
||||||
${LIBRARY_DIR}/arrow/schema.cc
|
${LIBRARY_DIR}/arrow/schema.cc
|
||||||
|
${LIBRARY_DIR}/arrow/schema_internal.cc
|
||||||
${LIBRARY_DIR}/arrow/writer.cc
|
${LIBRARY_DIR}/arrow/writer.cc
|
||||||
${LIBRARY_DIR}/arrow/path_internal.cc
|
|
||||||
${LIBRARY_DIR}/bloom_filter.cc
|
${LIBRARY_DIR}/bloom_filter.cc
|
||||||
${LIBRARY_DIR}/column_reader.cc
|
${LIBRARY_DIR}/column_reader.cc
|
||||||
${LIBRARY_DIR}/column_scanner.cc
|
${LIBRARY_DIR}/column_scanner.cc
|
||||||
${LIBRARY_DIR}/column_writer.cc
|
${LIBRARY_DIR}/column_writer.cc
|
||||||
${LIBRARY_DIR}/deprecated_io.cc
|
${LIBRARY_DIR}/deprecated_io.cc
|
||||||
${LIBRARY_DIR}/encoding.cc
|
${LIBRARY_DIR}/encoding.cc
|
||||||
|
${LIBRARY_DIR}/encryption.cc
|
||||||
|
${LIBRARY_DIR}/encryption_internal.cc
|
||||||
${LIBRARY_DIR}/file_reader.cc
|
${LIBRARY_DIR}/file_reader.cc
|
||||||
${LIBRARY_DIR}/file_writer.cc
|
${LIBRARY_DIR}/file_writer.cc
|
||||||
|
${LIBRARY_DIR}/internal_file_decryptor.cc
|
||||||
|
${LIBRARY_DIR}/internal_file_encryptor.cc
|
||||||
|
${LIBRARY_DIR}/level_conversion.cc
|
||||||
|
${LIBRARY_DIR}/level_comparison.cc
|
||||||
${LIBRARY_DIR}/metadata.cc
|
${LIBRARY_DIR}/metadata.cc
|
||||||
${LIBRARY_DIR}/murmur3.cc
|
${LIBRARY_DIR}/murmur3.cc
|
||||||
${LIBRARY_DIR}/platform.cc
|
${LIBRARY_DIR}/platform.cc
|
||||||
@ -340,10 +386,6 @@ set(PARQUET_SRCS
|
|||||||
${LIBRARY_DIR}/schema.cc
|
${LIBRARY_DIR}/schema.cc
|
||||||
${LIBRARY_DIR}/statistics.cc
|
${LIBRARY_DIR}/statistics.cc
|
||||||
${LIBRARY_DIR}/types.cc
|
${LIBRARY_DIR}/types.cc
|
||||||
${LIBRARY_DIR}/encryption.cc
|
|
||||||
${LIBRARY_DIR}/encryption_internal.cc
|
|
||||||
${LIBRARY_DIR}/internal_file_decryptor.cc
|
|
||||||
${LIBRARY_DIR}/internal_file_encryptor.cc
|
|
||||||
|
|
||||||
${GEN_LIBRARY_DIR}/parquet_constants.cpp
|
${GEN_LIBRARY_DIR}/parquet_constants.cpp
|
||||||
${GEN_LIBRARY_DIR}/parquet_types.cpp
|
${GEN_LIBRARY_DIR}/parquet_types.cpp
|
||||||
|
@ -1,26 +0,0 @@
|
|||||||
// Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
// or more contributor license agreements. See the NOTICE file
|
|
||||||
// distributed with this work for additional information
|
|
||||||
// regarding copyright ownership. The ASF licenses this file
|
|
||||||
// to you under the Apache License, Version 2.0 (the
|
|
||||||
// "License"); you may not use this file except in compliance
|
|
||||||
// with the License. You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing,
|
|
||||||
// software distributed under the License is distributed on an
|
|
||||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
// KIND, either express or implied. See the License for the
|
|
||||||
// specific language governing permissions and limitations
|
|
||||||
// under the License.
|
|
||||||
|
|
||||||
#define ARROW_VERSION_MAJOR
|
|
||||||
#define ARROW_VERSION_MINOR
|
|
||||||
#define ARROW_VERSION_PATCH
|
|
||||||
#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH
|
|
||||||
|
|
||||||
#define ARROW_SO_VERSION ""
|
|
||||||
#define ARROW_FULL_SO_VERSION ""
|
|
||||||
|
|
||||||
/* #undef GRPCPP_PP_INCLUDE */
|
|
@ -22,8 +22,8 @@
|
|||||||
#define PARQUET_VERSION_MINOR 5
|
#define PARQUET_VERSION_MINOR 5
|
||||||
#define PARQUET_VERSION_PATCH 1
|
#define PARQUET_VERSION_PATCH 1
|
||||||
|
|
||||||
#define PARQUET_SO_VERSION 0
|
#define PARQUET_SO_VERSION "200"
|
||||||
#define PARQUET_FULL_SO_VERSION 0.17
|
#define PARQUET_FULL_SO_VERSION "200.0.0"
|
||||||
|
|
||||||
// define the parquet created by version
|
// define the parquet created by version
|
||||||
#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT"
|
#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT"
|
||||||
|
@ -363,7 +363,7 @@ endif ()
|
|||||||
|
|
||||||
if (USE_PARQUET)
|
if (USE_PARQUET)
|
||||||
dbms_target_link_libraries(PRIVATE ${PARQUET_LIBRARY})
|
dbms_target_link_libraries(PRIVATE ${PARQUET_LIBRARY})
|
||||||
if (NOT USE_INTERNAL_PARQUET_LIBRARY OR USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
|
if (NOT USE_INTERNAL_PARQUET_LIBRARY)
|
||||||
dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${PARQUET_INCLUDE_DIR} ${ARROW_INCLUDE_DIR})
|
dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${PARQUET_INCLUDE_DIR} ${ARROW_INCLUDE_DIR})
|
||||||
if (USE_STATIC_LIBRARIES)
|
if (USE_STATIC_LIBRARIES)
|
||||||
dbms_target_link_libraries(PRIVATE ${ARROW_LIBRARY})
|
dbms_target_link_libraries(PRIVATE ${ARROW_LIBRARY})
|
||||||
|
@ -62,9 +62,9 @@ void ArrowBlockOutputFormat::prepareWriter(const std::shared_ptr<arrow::Schema>
|
|||||||
|
|
||||||
// TODO: should we use arrow::ipc::IpcOptions::alignment?
|
// TODO: should we use arrow::ipc::IpcOptions::alignment?
|
||||||
if (stream)
|
if (stream)
|
||||||
writer_status = arrow::ipc::NewStreamWriter(arrow_ostream.get(), schema);
|
writer_status = arrow::ipc::MakeStreamWriter(arrow_ostream.get(), schema);
|
||||||
else
|
else
|
||||||
writer_status = arrow::ipc::NewFileWriter(arrow_ostream.get(), schema);
|
writer_status = arrow::ipc::MakeFileWriter(arrow_ostream.get(), schema);
|
||||||
|
|
||||||
if (!writer_status.ok())
|
if (!writer_status.ok())
|
||||||
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
|
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
|
||||||
|
@ -1,54 +0,0 @@
|
|||||||
#!/usr/bin/env perl
|
|
||||||
package parquet_create_table_columns;
|
|
||||||
use strict;
|
|
||||||
no warnings 'experimental';
|
|
||||||
use feature 'signatures';
|
|
||||||
use JSON::XS;
|
|
||||||
#use Data::Dumper;
|
|
||||||
|
|
||||||
sub file_read($file) {
|
|
||||||
open my $f, '<', $file or return;
|
|
||||||
local $/ = undef;
|
|
||||||
my $ret = <$f>;
|
|
||||||
close $f;
|
|
||||||
return $ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
our $type_parquet_logical_to_clickhouse = {
|
|
||||||
DECIMAL => 'Decimal128(1)',
|
|
||||||
TIMESTAMP_MICROS => 'DateTime',
|
|
||||||
TIMESTAMP_MILLIS => 'DateTime',
|
|
||||||
};
|
|
||||||
our $type_parquet_physical_to_clickhouse = {
|
|
||||||
BOOLEAN => 'UInt8',
|
|
||||||
INT32 => 'Int32',
|
|
||||||
INT64 => 'Int64',
|
|
||||||
FLOAT => 'Float32',
|
|
||||||
DOUBLE => 'Float64',
|
|
||||||
BYTE_ARRAY => 'String',
|
|
||||||
FIXED_LEN_BYTE_ARRAY => 'String', # Maybe FixedString?
|
|
||||||
INT96 => 'Int64', # TODO!
|
|
||||||
};
|
|
||||||
|
|
||||||
sub columns ($json) {
|
|
||||||
my @list;
|
|
||||||
my %uniq;
|
|
||||||
for my $column (@{$json->{Columns}}) {
|
|
||||||
#warn Data::Dumper::Dumper $column;
|
|
||||||
my $name = $column->{'Name'};
|
|
||||||
my $type = $type_parquet_logical_to_clickhouse->{$column->{'LogicalType'}} || $type_parquet_physical_to_clickhouse->{$column->{'PhysicalType'}};
|
|
||||||
unless ($type) {
|
|
||||||
warn "Unknown type [$column->{'PhysicalType'}:$column->{'LogicalType'}] of column [$name]";
|
|
||||||
}
|
|
||||||
$type = "Nullable($type)";
|
|
||||||
$name .= $column->{'Id'} if $uniq{$name}++; # Names can be non-unique
|
|
||||||
push @list, {name => $name, type => $type};
|
|
||||||
}
|
|
||||||
print join ', ', map {"`$_->{name}` $_->{type}"} @list;
|
|
||||||
}
|
|
||||||
|
|
||||||
sub columns_file ($file) {
|
|
||||||
return columns(JSON::XS::decode_json(file_read($file)));
|
|
||||||
}
|
|
||||||
|
|
||||||
columns_file(shift) unless caller;
|
|
@ -13,134 +13,220 @@
|
|||||||
=== Try load data from alltypes_plain.snappy.parquet
|
=== Try load data from alltypes_plain.snappy.parquet
|
||||||
6 1 0 0 0 0 0 0 04/01/09 0 1238544000
|
6 1 0 0 0 0 0 0 04/01/09 0 1238544000
|
||||||
7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060
|
7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060
|
||||||
|
=== Try load data from binary.parquet
|
||||||
|
\0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\b
|
||||||
|
\t
|
||||||
|
\n
|
||||||
|
|
||||||
=== Try load data from byte_array_decimal.parquet
|
=== Try load data from byte_array_decimal.parquet
|
||||||
1.0
|
1.00
|
||||||
2.0
|
2.00
|
||||||
3.0
|
3.00
|
||||||
4.0
|
4.00
|
||||||
5.0
|
5.00
|
||||||
6.0
|
6.00
|
||||||
7.0
|
7.00
|
||||||
8.0
|
8.00
|
||||||
9.0
|
9.00
|
||||||
10.0
|
10.00
|
||||||
11.0
|
11.00
|
||||||
12.0
|
12.00
|
||||||
13.0
|
13.00
|
||||||
14.0
|
14.00
|
||||||
15.0
|
15.00
|
||||||
16.0
|
16.00
|
||||||
17.0
|
17.00
|
||||||
18.0
|
18.00
|
||||||
19.0
|
19.00
|
||||||
20.0
|
20.00
|
||||||
21.0
|
21.00
|
||||||
22.0
|
22.00
|
||||||
23.0
|
23.00
|
||||||
24.0
|
24.00
|
||||||
=== Try load data from datapage_v2.snappy.parquet
|
=== Try load data from datapage_v2.snappy.parquet
|
||||||
Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet implemented: Unsupported encoding.: data for INSERT was parsed from stdin
|
Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet implemented: Unsupported encoding.: data for INSERT was parsed from stdin
|
||||||
|
|
||||||
|
=== Try load data from dict-page-offset-zero.parquet
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
1552
|
||||||
|
=== Try load data from fixed_length_decimal.parquet
|
||||||
|
1.00
|
||||||
|
2.00
|
||||||
|
3.00
|
||||||
|
4.00
|
||||||
|
5.00
|
||||||
|
6.00
|
||||||
|
7.00
|
||||||
|
8.00
|
||||||
|
9.00
|
||||||
|
10.00
|
||||||
|
11.00
|
||||||
|
12.00
|
||||||
|
13.00
|
||||||
|
14.00
|
||||||
|
15.00
|
||||||
|
16.00
|
||||||
|
17.00
|
||||||
|
18.00
|
||||||
|
19.00
|
||||||
|
20.00
|
||||||
|
21.00
|
||||||
|
22.00
|
||||||
|
23.00
|
||||||
|
24.00
|
||||||
=== Try load data from fixed_length_decimal_1.parquet
|
=== Try load data from fixed_length_decimal_1.parquet
|
||||||
1.0
|
1.00
|
||||||
2.0
|
2.00
|
||||||
3.0
|
3.00
|
||||||
4.0
|
4.00
|
||||||
5.0
|
5.00
|
||||||
6.0
|
6.00
|
||||||
7.0
|
7.00
|
||||||
8.0
|
8.00
|
||||||
9.0
|
9.00
|
||||||
10.0
|
10.00
|
||||||
11.0
|
11.00
|
||||||
12.0
|
12.00
|
||||||
13.0
|
13.00
|
||||||
14.0
|
14.00
|
||||||
15.0
|
15.00
|
||||||
16.0
|
16.00
|
||||||
17.0
|
17.00
|
||||||
18.0
|
18.00
|
||||||
19.0
|
19.00
|
||||||
20.0
|
20.00
|
||||||
21.0
|
21.00
|
||||||
22.0
|
22.00
|
||||||
23.0
|
23.00
|
||||||
24.0
|
24.00
|
||||||
=== Try load data from fixed_length_decimal_legacy.parquet
|
=== Try load data from fixed_length_decimal_legacy.parquet
|
||||||
1.0
|
1.00
|
||||||
2.0
|
2.00
|
||||||
3.0
|
3.00
|
||||||
4.0
|
4.00
|
||||||
5.0
|
5.00
|
||||||
6.0
|
6.00
|
||||||
7.0
|
7.00
|
||||||
8.0
|
8.00
|
||||||
9.0
|
9.00
|
||||||
10.0
|
10.00
|
||||||
11.0
|
11.00
|
||||||
12.0
|
12.00
|
||||||
13.0
|
13.00
|
||||||
14.0
|
14.00
|
||||||
15.0
|
15.00
|
||||||
16.0
|
16.00
|
||||||
17.0
|
17.00
|
||||||
18.0
|
18.00
|
||||||
19.0
|
19.00
|
||||||
20.0
|
20.00
|
||||||
21.0
|
21.00
|
||||||
22.0
|
22.00
|
||||||
23.0
|
23.00
|
||||||
24.0
|
24.00
|
||||||
|
=== Try load data from hadoop_lz4_compressed.parquet
|
||||||
|
1593604800 abc 42
|
||||||
|
1593604800 def 7.7
|
||||||
|
1593604801 abc 42.125
|
||||||
|
1593604801 def 7.7
|
||||||
=== Try load data from int32_decimal.parquet
|
=== Try load data from int32_decimal.parquet
|
||||||
1.0
|
1.00
|
||||||
2.0
|
2.00
|
||||||
3.0
|
3.00
|
||||||
4.0
|
4.00
|
||||||
5.0
|
5.00
|
||||||
6.0
|
6.00
|
||||||
7.0
|
7.00
|
||||||
8.0
|
8.00
|
||||||
9.0
|
9.00
|
||||||
10.0
|
10.00
|
||||||
11.0
|
11.00
|
||||||
12.0
|
12.00
|
||||||
13.0
|
13.00
|
||||||
14.0
|
14.00
|
||||||
15.0
|
15.00
|
||||||
16.0
|
16.00
|
||||||
17.0
|
17.00
|
||||||
18.0
|
18.00
|
||||||
19.0
|
19.00
|
||||||
20.0
|
20.00
|
||||||
21.0
|
21.00
|
||||||
22.0
|
22.00
|
||||||
23.0
|
23.00
|
||||||
24.0
|
24.00
|
||||||
=== Try load data from int64_decimal.parquet
|
=== Try load data from int64_decimal.parquet
|
||||||
1.0
|
1.00
|
||||||
2.0
|
2.00
|
||||||
3.0
|
3.00
|
||||||
4.0
|
4.00
|
||||||
5.0
|
5.00
|
||||||
6.0
|
6.00
|
||||||
7.0
|
7.00
|
||||||
8.0
|
8.00
|
||||||
9.0
|
9.00
|
||||||
10.0
|
10.00
|
||||||
11.0
|
11.00
|
||||||
12.0
|
12.00
|
||||||
13.0
|
13.00
|
||||||
14.0
|
14.00
|
||||||
15.0
|
15.00
|
||||||
16.0
|
16.00
|
||||||
17.0
|
17.00
|
||||||
18.0
|
18.00
|
||||||
19.0
|
19.00
|
||||||
20.0
|
20.00
|
||||||
21.0
|
21.00
|
||||||
22.0
|
22.00
|
||||||
23.0
|
23.00
|
||||||
24.0
|
24.00
|
||||||
|
=== Try load data from list_columns.parquet
|
||||||
|
Code: 70. DB::Ex---tion: The type "list" of an input column "int64_list" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
|
||||||
|
|
||||||
=== Try load data from nation.dict-malformed.parquet
|
=== Try load data from nation.dict-malformed.parquet
|
||||||
0 ALGERIA 0 haggle. carefully final deposits detect slyly agai
|
0 ALGERIA 0 haggle. carefully final deposits detect slyly agai
|
||||||
1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon
|
1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon
|
||||||
@ -168,23 +254,25 @@ Code: 33. DB::Ex---tion: Error while reading Parquet data: IOError: Not yet impl
|
|||||||
23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull
|
23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull
|
||||||
24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be
|
24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be
|
||||||
=== Try load data from nested_lists.snappy.parquet
|
=== Try load data from nested_lists.snappy.parquet
|
||||||
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
|
Code: 70. DB::Ex---tion: The type "list" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
|
||||||
|
|
||||||
=== Try load data from nested_maps.snappy.parquet
|
=== Try load data from nested_maps.snappy.parquet
|
||||||
Code: 33. DB::Ex---tion: Error while reading Parquet data: NotImplemented: Reading lists of structs from Parquet files not yet supported: key_value: list<key_value: struct<key: string not null, value: struct<key_value: list<key_value: struct<key: int32 not null, value: bool not null> not null> not null>> not null> not null: data for INSERT was parsed from stdin
|
Code: 70. DB::Ex---tion: The type "map" of an input column "a" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
|
||||||
|
|
||||||
|
=== Try load data from non_hadoop_lz4_compressed.parquet
|
||||||
|
1593604800 abc 42
|
||||||
|
1593604800 def 7.7
|
||||||
|
1593604801 abc 42.125
|
||||||
|
1593604801 def 7.7
|
||||||
=== Try load data from nonnullable.impala.parquet
|
=== Try load data from nonnullable.impala.parquet
|
||||||
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
|
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
|
||||||
|
|
||||||
=== Try load data from nullable.impala.parquet
|
=== Try load data from nullable.impala.parquet
|
||||||
Code: 8. DB::Ex---tion: Column "element" is not presented in input data: data for INSERT was parsed from stdin
|
../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
|
||||||
|
|
||||||
=== Try load data from nulls.snappy.parquet
|
=== Try load data from nulls.snappy.parquet
|
||||||
Code: 8. DB::Ex---tion: Column "b_c_int" is not presented in input data: data for INSERT was parsed from stdin
|
Code: 70. DB::Ex---tion: The type "struct" of an input column "b_struct" is not supported for conversion from a Parquet data format: data for INSERT was parsed from stdin
|
||||||
|
|
||||||
=== Try load data from repeated_no_annotation.parquet
|
|
||||||
Code: 8. DB::Ex---tion: Column "number" is not presented in input data: data for INSERT was parsed from stdin
|
|
||||||
|
|
||||||
|
=== Try load data from single_nan.parquet
|
||||||
|
\N
|
||||||
=== Try load data from userdata1.parquet
|
=== Try load data from userdata1.parquet
|
||||||
1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02
|
1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02
|
||||||
1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV
|
1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV
|
||||||
|
@ -5,8 +5,6 @@
|
|||||||
# TODO: Add more files.
|
# TODO: Add more files.
|
||||||
#
|
#
|
||||||
|
|
||||||
# To regenerate data install perl JSON::XS module: sudo apt install libjson-xs-perl
|
|
||||||
|
|
||||||
# Also 5 sample files from
|
# Also 5 sample files from
|
||||||
# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet
|
# wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet
|
||||||
# ...
|
# ...
|
||||||
@ -19,38 +17,46 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
|||||||
. "$CUR_DIR"/../shell_config.sh
|
. "$CUR_DIR"/../shell_config.sh
|
||||||
|
|
||||||
CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY")
|
CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY")
|
||||||
[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../../..
|
[ "$CB_DIR" == "." ] && ROOT_DIR=$CUR_DIR/../../..
|
||||||
[ "$CB_DIR" != "." ] && BUILD_DIR=$CB_DIR/../..
|
[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../..
|
||||||
[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../../..
|
|
||||||
|
|
||||||
DATA_DIR=$CUR_DIR/data_parquet
|
DATA_DIR=$CUR_DIR/data_parquet
|
||||||
|
|
||||||
|
[ -n "$ROOT_DIR" ] && [ -z "$PARQUET_READER" ] && PARQUET_READER="$ROOT_DIR"/contrib/arrow/cpp/build/release/parquet-reader
|
||||||
|
|
||||||
# To update:
|
# To update:
|
||||||
# cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/
|
# cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/
|
||||||
|
|
||||||
# BUG! nulls.snappy.parquet - parquet-reader shows wrong structure. Actual structure is {"type":"struct","fields":[{"name":"b_struct","type":{"type":"struct","fields":[{"name":"b_c_int","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}
|
# ClickHouse Parquet reader doesn't support such complex types, so I didn't burrow into the issue.
|
||||||
# why? repeated_no_annotation.parquet
|
# There is failure due parsing nested arrays or nested maps with NULLs:
|
||||||
|
# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:192: Check failed: (self->list_type_->value_type()->id()) == (data->child_data[0]->type->id())
|
||||||
|
|
||||||
for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | sort); do
|
# Strange behaviour for repeated_no_annotation.parquet around __buitin_expect, so this file was disabled:
|
||||||
|
# debug:
|
||||||
|
# ../contrib/arrow/cpp/src/arrow/array/array_nested.cc:193: Check failed: self->list_type_->value_type()->Equals(data->child_data[0]->type)
|
||||||
|
# release:
|
||||||
|
# Code: 349. DB::Ex---tion: Can not insert NULL data into non-nullable column "phoneNumbers": data for INSERT was parsed from stdin
|
||||||
|
|
||||||
|
for NAME in $(find "$DATA_DIR"/*.parquet -print0 | xargs -0 -n 1 basename | LC_ALL=C sort); do
|
||||||
echo === Try load data from "$NAME"
|
echo === Try load data from "$NAME"
|
||||||
|
|
||||||
JSON=$DATA_DIR/$NAME.json
|
JSON=$DATA_DIR/$NAME.json
|
||||||
COLUMNS_FILE=$DATA_DIR/$NAME.columns
|
COLUMNS_FILE=$DATA_DIR/$NAME.columns
|
||||||
|
|
||||||
# If you want change or add .parquet file - rm data_parquet/*.json data_parquet/*.columns
|
# If you want change or add .parquet file - rm data_parquet/*.json data_parquet/*.columns
|
||||||
[ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$BUILD_DIR"/contrib/arrow-cmake/parquet-reader --json "$DATA_DIR"/"$NAME" > "$JSON"
|
[ -n "$PARQUET_READER" ] && [ ! -s "$COLUMNS_FILE" ] && [ ! -s "$JSON" ] && "$PARQUET_READER" --json "$DATA_DIR"/"$NAME" > "$JSON"
|
||||||
[ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/00900_parquet_create_table_columns.pl "$JSON" > "$COLUMNS_FILE"
|
[ ! -s "$COLUMNS_FILE" ] && "$CUR_DIR"/helpers/00900_parquet_create_table_columns.py "$JSON" > "$COLUMNS_FILE"
|
||||||
|
|
||||||
# Debug only:
|
# Debug only:
|
||||||
# [ -n "$BUILD_DIR" ] && $BUILD_DIR/contrib/arrow-cmake/parquet-reader $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump
|
# [ -n "$PARQUET_READER" ] && $PARQUET_READER $DATA_DIR/$NAME > $DATA_DIR/$NAME.dump
|
||||||
|
|
||||||
#COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.pl $JSON` 2>&1 || continue
|
# COLUMNS=`$CUR_DIR/00900_parquet_create_table_columns.py $JSON` 2>&1 || continue
|
||||||
COLUMNS=$(cat "$COLUMNS_FILE") || continue
|
COLUMNS=$(cat "$COLUMNS_FILE") || continue
|
||||||
|
|
||||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
|
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load"
|
||||||
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load ($COLUMNS) ENGINE = Memory"
|
${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load ($COLUMNS) ENGINE = Memory"
|
||||||
|
|
||||||
# Some files is broken, exception is ok.
|
# Some files contain unsupported data structures, exception is ok.
|
||||||
cat "$DATA_DIR"/"$NAME" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_load FORMAT Parquet" 2>&1 | sed 's/Exception/Ex---tion/'
|
cat "$DATA_DIR"/"$NAME" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO parquet_load FORMAT Parquet" 2>&1 | sed 's/Exception/Ex---tion/'
|
||||||
|
|
||||||
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load LIMIT 100"
|
${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load LIMIT 100"
|
||||||
|
BIN
tests/queries/0_stateless/data_parquet/binary.parquet
Normal file
BIN
tests/queries/0_stateless/data_parquet/binary.parquet
Normal file
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
`foo` Nullable(String)
|
@ -1 +1 @@
|
|||||||
`value` Nullable(Decimal128(1))
|
`value` Nullable(Decimal(4, 2))
|
||||||
|
@ -1 +1 @@
|
|||||||
`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `element` Nullable(Int32)
|
`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Nullable(Int32)
|
||||||
|
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
`l_partkey` Nullable(Int32)
|
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
`value` Nullable(Decimal(25, 2))
|
@ -1 +1 @@
|
|||||||
`value` Nullable(Decimal128(1))
|
`value` Nullable(Decimal(25, 2))
|
||||||
|
@ -1 +1 @@
|
|||||||
`value` Nullable(Decimal128(1))
|
`value` Nullable(Decimal(13, 2))
|
||||||
|
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
`c0` Nullable(Int64), `c1` Nullable(String), `v11` Nullable(Float64)
|
@ -1 +1 @@
|
|||||||
`value` Nullable(Decimal128(1))
|
`value` Nullable(Decimal(4, 2))
|
||||||
|
@ -1 +1 @@
|
|||||||
`value` Nullable(Decimal128(1))
|
`value` Nullable(Decimal(10, 2))
|
||||||
|
BIN
tests/queries/0_stateless/data_parquet/list_columns.parquet
Normal file
BIN
tests/queries/0_stateless/data_parquet/list_columns.parquet
Normal file
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
`int64_list` Nullable(Int64), `utf8_list` Nullable(String)
|
@ -1 +1 @@
|
|||||||
`element` Nullable(String), `b` Nullable(Int32)
|
`a` Nullable(String), `b` Nullable(Int32)
|
||||||
|
@ -1 +1 @@
|
|||||||
`key` Nullable(String), `key1` Nullable(Int32), `value` Nullable(UInt8), `b` Nullable(Int32), `c` Nullable(Float64)
|
`a` Tuple(Nullable(String), Nullable(Int32), Nullable(UInt8)), `b` Nullable(Int32), `c` Nullable(Float64)
|
||||||
|
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
`c0` Nullable(Int64), `c1` Nullable(String), `v11` Nullable(Float64)
|
@ -1 +1 @@
|
|||||||
`ID` Nullable(Int64), `element` Nullable(Int32), `element2` Nullable(Int32), `key` Nullable(String), `value` Nullable(Int32), `key5` Nullable(String), `value6` Nullable(Int32), `a` Nullable(Int32), `element8` Nullable(Int32), `e` Nullable(Int32), `f` Nullable(String), `key11` Nullable(String), `element12` Nullable(Float64)
|
`ID` Nullable(Int64), `Int_Array` Nullable(Int32), `int_array_array` Nullable(Int32), `Int_Map` Tuple(Nullable(String), Nullable(Int32)), `int_map_array` Tuple(Nullable(String), Nullable(Int32)), `nested_Struct` Tuple(Nullable(Int32), Nullable(Int32), Nullable(Int32), Nullable(String), Nullable(String), Nullable(Float64))
|
||||||
|
@ -1 +1 @@
|
|||||||
`id` Nullable(Int64), `element` Nullable(Int32), `element2` Nullable(Int32), `key` Nullable(String), `value` Nullable(Int32), `key5` Nullable(String), `value6` Nullable(Int32), `A` Nullable(Int32), `element8` Nullable(Int32), `E` Nullable(Int32), `F` Nullable(String), `key11` Nullable(String), `element12` Nullable(Float64)
|
`id` Nullable(Int64), `int_array` Nullable(Int32), `int_array_Array` Nullable(Int32), `int_map` Tuple(Nullable(String), Nullable(Int32)), `int_Map_Array` Tuple(Nullable(String), Nullable(Int32)), `nested_struct` Tuple(Nullable(Int32), Nullable(Int32), Nullable(Int32), Nullable(String), Nullable(String), Nullable(Float64))
|
||||||
|
@ -1 +1 @@
|
|||||||
`b_c_int` Nullable(Int32)
|
`b_struct` Nullable(Int32)
|
||||||
|
@ -1 +0,0 @@
|
|||||||
`id` Nullable(Int32), `number` Nullable(Int64), `kind` Nullable(String)
|
|
BIN
tests/queries/0_stateless/data_parquet/single_nan.parquet
Normal file
BIN
tests/queries/0_stateless/data_parquet/single_nan.parquet
Normal file
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
`mycol` Nullable(Float64)
|
88
tests/queries/0_stateless/helpers/00900_parquet_create_table_columns.py
Executable file
88
tests/queries/0_stateless/helpers/00900_parquet_create_table_columns.py
Executable file
@ -0,0 +1,88 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE = {
|
||||||
|
"TIMESTAMP_MICROS": "DateTime",
|
||||||
|
"TIMESTAMP_MILLIS": "DateTime",
|
||||||
|
"UTF8": "String",
|
||||||
|
}
|
||||||
|
|
||||||
|
TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE = {
|
||||||
|
"BOOLEAN": "UInt8",
|
||||||
|
"INT32": "Int32",
|
||||||
|
"INT64": "Int64",
|
||||||
|
"FLOAT": "Float32",
|
||||||
|
"DOUBLE": "Float64",
|
||||||
|
"BYTE_ARRAY": "String",
|
||||||
|
"INT96": "Int64", # TODO!
|
||||||
|
}
|
||||||
|
|
||||||
|
def read_file(filename):
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
return f.read().decode("raw_unicode_escape")
|
||||||
|
|
||||||
|
def get_column_name(column):
|
||||||
|
return column["Name"].split(".", 1)[0]
|
||||||
|
|
||||||
|
def resolve_clickhouse_column_type(column):
|
||||||
|
column_name = get_column_name(column)
|
||||||
|
logical_type = column.get("LogicalType", {})
|
||||||
|
converted_type = column.get("ConvertedType", "").upper()
|
||||||
|
physical_type = column.get("PhysicalType", "").upper()
|
||||||
|
if logical_type and logical_type.get("Type", "").upper() == "DECIMAL":
|
||||||
|
precision = int(logical_type["precision"])
|
||||||
|
scale = int(logical_type["scale"])
|
||||||
|
if precision < 1 or precision > 76:
|
||||||
|
raise RuntimeError("Column {} has invalid Decimal precision {}".format(column_name, precision))
|
||||||
|
if precision > 38:
|
||||||
|
raise RuntimeError("Column {} has unsupported Decimal precision {}".format(column_name, precision))
|
||||||
|
if scale < 0 or scale > precision:
|
||||||
|
raise RuntimeError("Column {} has invalid Decimal scale {} for precision {}".format(column_name, scale, precision))
|
||||||
|
return "Decimal({}, {})".format(precision, scale)
|
||||||
|
if converted_type and converted_type != "NONE":
|
||||||
|
result_type = TYPE_PARQUET_CONVERTED_TO_CLICKHOUSE.get(converted_type)
|
||||||
|
if result_type:
|
||||||
|
return result_type
|
||||||
|
raise RuntimeError("Column {} has unknown ConvertedType: {}".format(column_name, converted_type))
|
||||||
|
if physical_type and physical_type != "NONE":
|
||||||
|
result_type = TYPE_PARQUET_PHYSICAL_TO_CLICKHOUSE.get(physical_type)
|
||||||
|
if result_type:
|
||||||
|
return result_type
|
||||||
|
raise RuntimeError("Column {} has unknown PhysicalType: {}".format(column_name, physical_type))
|
||||||
|
raise RuntimeError("Column {} has invalid types: ConvertedType={}, PhysicalType={}".format(column_name, converted_type, physical_type))
|
||||||
|
|
||||||
|
def dump_columns(obj):
|
||||||
|
descr_by_column_name = {}
|
||||||
|
columns_descr = []
|
||||||
|
for column in obj["Columns"]:
|
||||||
|
column_name = get_column_name(column)
|
||||||
|
column_type = resolve_clickhouse_column_type(column)
|
||||||
|
result_type = "Nullable({})".format(column_type)
|
||||||
|
if column_name in descr_by_column_name:
|
||||||
|
descr = descr_by_column_name[column_name]
|
||||||
|
descr["types"].append(result_type)
|
||||||
|
else:
|
||||||
|
descr = {
|
||||||
|
"name": column_name,
|
||||||
|
"types": [result_type],
|
||||||
|
}
|
||||||
|
descr_by_column_name[column_name] = descr
|
||||||
|
columns_descr.append(descr)
|
||||||
|
|
||||||
|
# Make tuples from nested types. CH Server doesn't support such Arrow type but it makes Server Exceptions more relevant.
|
||||||
|
def _format_type(types):
|
||||||
|
if len(types) == 1:
|
||||||
|
return types[0]
|
||||||
|
else:
|
||||||
|
return "Tuple({})".format(", ".join(types))
|
||||||
|
|
||||||
|
print(", ".join(map(lambda descr: "`{}` {}".format(descr["name"], _format_type(descr["types"])), columns_descr)))
|
||||||
|
|
||||||
|
def dump_columns_from_file(filename):
|
||||||
|
dump_columns(json.loads(read_file(filename), strict=False))
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
filename = sys.argv[1]
|
||||||
|
dump_columns_from_file(filename)
|
Loading…
Reference in New Issue
Block a user