Add parquet support. Fixes, tests, ...

This commit is contained in:
proller 2019-02-19 23:51:44 +03:00
parent 1def68be6f
commit e20c77e4c6
103 changed files with 11778 additions and 798 deletions

14
.gitmodules vendored
View File

@ -48,10 +48,16 @@
url = https://github.com/ClickHouse-Extras/protobuf.git
[submodule "contrib/boost"]
path = contrib/boost
url = https://github.com/ClickHouse-Extras/boost-extra.git
url = https://github.com/ClickHouse-Extras/boost.git
[submodule "contrib/base64"]
path = contrib/base64
url = https://github.com/aklomp/base64.git
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/apache/arrow
[submodule "contrib/thrift"]
path = contrib/thrift
url = https://github.com/apache/thrift.git
[submodule "contrib/libhdfs3"]
path = contrib/libhdfs3
url = https://github.com/ClickHouse-Extras/libhdfs3.git
@ -61,12 +67,12 @@
[submodule "contrib/libgsasl"]
path = contrib/libgsasl
url = https://github.com/ClickHouse-Extras/libgsasl.git
[submodule "contrib/snappy"]
path = contrib/snappy
url = https://github.com/google/snappy
[submodule "contrib/cppkafka"]
path = contrib/cppkafka
url = https://github.com/ClickHouse-Extras/cppkafka.git
[submodule "contrib/brotli"]
path = contrib/brotli
url = https://github.com/google/brotli.git
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/apache/arrow

View File

@ -224,6 +224,7 @@ endif ()
message (STATUS "Building for: ${CMAKE_SYSTEM} ${CMAKE_SYSTEM_PROCESSOR} ${CMAKE_LIBRARY_ARCHITECTURE} ; USE_STATIC_LIBRARIES=${USE_STATIC_LIBRARIES} MAKE_STATIC_LIBRARIES=${MAKE_STATIC_LIBRARIES} SPLIT_SHARED=${SPLIT_SHARED_LIBRARIES} UNBUNDLED=${UNBUNDLED} CCACHE=${CCACHE_FOUND} ${CCACHE_VERSION}")
include(GNUInstallDirs)
include (cmake/find_contrib_lib.cmake)
include (cmake/find_ssl.cmake)
include (cmake/lib_name.cmake)
@ -258,17 +259,15 @@ include (cmake/find_pdqsort.cmake)
include (cmake/find_hdfs3.cmake) # uses protobuf
include (cmake/find_consistent-hashing.cmake)
include (cmake/find_base64.cmake)
if (ENABLE_TESTS)
include (cmake/find_gtest.cmake)
endif ()
include (cmake/find_parquet.cmake)
include (cmake/find_contrib_lib.cmake)
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
find_contrib_lib(metrohash)
find_contrib_lib(btrie)
find_contrib_lib(double-conversion)
include (cmake/find_parquet.cmake)
if (ENABLE_TESTS)
include (cmake/find_gtest.cmake)
endif ()
# Need to process before "contrib" dir:
include (libs/libcommon/cmake/find_gperftools.cmake)

View File

@ -9,7 +9,7 @@ endif ()
if (NOT USE_INTERNAL_BOOST_LIBRARY)
set (Boost_USE_STATIC_LIBS ${USE_STATIC_LIBRARIES})
set (BOOST_ROOT "/usr/local")
find_package (Boost 1.60 COMPONENTS program_options system filesystem thread)
find_package (Boost 1.60 COMPONENTS program_options system filesystem thread regex)
# incomplete, no include search, who use it?
if (NOT Boost_FOUND)
# # Try to find manually.
@ -29,9 +29,12 @@ if (NOT Boost_SYSTEM_LIBRARY)
set (Boost_SYSTEM_LIBRARY boost_system_internal)
set (Boost_PROGRAM_OPTIONS_LIBRARY boost_program_options_internal)
set (Boost_FILESYSTEM_LIBRARY boost_filesystem_internal ${Boost_SYSTEM_LIBRARY})
set (Boost_REGEX_LIBRARY boost_regex_internal)
set (Boost_INCLUDE_DIRS)
set (BOOST_ROOT "${ClickHouse_SOURCE_DIR}/contrib/boost")
# For boost from github:
file (GLOB Boost_INCLUDE_DIRS_ "${ClickHouse_SOURCE_DIR}/contrib/boost/libs/*/include")
list (APPEND Boost_INCLUDE_DIRS ${Boost_INCLUDE_DIRS_})
@ -44,4 +47,4 @@ if (NOT Boost_SYSTEM_LIBRARY)
endif ()
message (STATUS "Using Boost: ${Boost_INCLUDE_DIRS} : ${Boost_PROGRAM_OPTIONS_LIBRARY},${Boost_SYSTEM_LIBRARY},${Boost_FILESYSTEM_LIBRARY}")
message (STATUS "Using Boost: ${Boost_INCLUDE_DIRS} : ${Boost_PROGRAM_OPTIONS_LIBRARY},${Boost_SYSTEM_LIBRARY},${Boost_FILESYSTEM_LIBRARY},${Boost_REGEX_LIBRARY}")

View File

@ -9,8 +9,9 @@ if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libxml2/libxml.h")
endif ()
if (NOT USE_INTERNAL_LIBXML2_LIBRARY)
find_library (LIBXML2_LIBRARY libxml2)
find_path (LIBXML2_INCLUDE_DIR NAMES libxml.h PATHS ${LIBXML2_INCLUDE_PATHS})
find_package (LibXml2)
#find_library (LIBXML2_LIBRARY libxml2)
#find_path (LIBXML2_INCLUDE_DIR NAMES libxml.h PATHS ${LIBXML2_INCLUDE_PATHS})
endif ()
if (LIBXML2_LIBRARY AND LIBXML2_INCLUDE_DIR)

View File

@ -1,8 +1,11 @@
option (USE_INTERNAL_LZ4_LIBRARY "Set to FALSE to use system lz4 library instead of bundled" ${NOT_UNBUNDLED})
if (USE_INTERNAL_LZ4_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/lz4/lib/lz4.h")
message (WARNING "submodule contrib/lz4 is missing. to fix try run: \n git submodule update --init --recursive")
set (USE_INTERNAL_LZ4_LIBRARY 0)
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/lz4/lib/lz4.h")
if (USE_INTERNAL_LZ4_LIBRARY)
message (WARNING "submodule contrib/lz4 is missing. to fix try run: \n git submodule update --init --recursive")
set (USE_INTERNAL_LZ4_LIBRARY 0)
endif ()
set (MISSING_INTERNAL_LZ4_LIBRARY 1)
endif ()
if (NOT USE_INTERNAL_LZ4_LIBRARY)
@ -11,7 +14,7 @@ if (NOT USE_INTERNAL_LZ4_LIBRARY)
endif ()
if (LZ4_LIBRARY AND LZ4_INCLUDE_DIR)
else ()
elseif (NOT MISSING_INTERNAL_LZ4_LIBRARY)
set (LZ4_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/lz4/lib)
set (USE_INTERNAL_LZ4_LIBRARY 1)
set (LZ4_LIBRARY lz4)

View File

@ -1,31 +1,68 @@
option (USE_INTERNAL_PARQUET_LIBRARY "Set to FALSE to use system parquet library instead of bundled" ${NOT_UNBUNDLED})
if (NOT OS_FREEBSD) # Freebsd: ../contrib/arrow/cpp/src/arrow/util/bit-util.h:27:10: fatal error: endian.h: No such file or directory
option(USE_INTERNAL_PARQUET_LIBRARY "Set to FALSE to use system parquet library instead of bundled" ${NOT_UNBUNDLED})
endif()
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/CMakeLists.txt")
if (USE_INTERNAL_PARQUET_LIBRARY)
message (WARNING "submodule contrib/arrow (required for Parquet) is missing. to fix try run: \n git submodule update --init --recursive")
endif ()
set (USE_INTERNAL_PARQUET_LIBRARY 0)
set (MISSING_INTERNAL_PARQUET_LIBRARY 1)
endif ()
if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/CMakeLists.txt")
if(USE_INTERNAL_PARQUET_LIBRARY)
message(WARNING "submodule contrib/arrow (required for Parquet) is missing. to fix try run: \n git submodule update --init --recursive")
endif()
set(USE_INTERNAL_PARQUET_LIBRARY 0)
set(MISSING_INTERNAL_PARQUET_LIBRARY 1)
endif()
if (NOT USE_INTERNAL_PARQUET_LIBRARY)
find_package (Arrow)
find_package (Parquet)
endif ()
if(NOT USE_INTERNAL_PARQUET_LIBRARY)
find_package(Arrow)
find_package(Parquet)
endif()
if (ARROW_INCLUDE_DIR AND PARQUET_INCLUDE_DIR)
elseif (NOT MISSING_INTERNAL_PARQUET_LIBRARY)
set (USE_INTERNAL_PARQUET_LIBRARY 1)
# TODO: is it required?
# set (ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow")
# set (PARQUET_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/parquet")
set (ARROW_LIBRARY arrow_static)
set (PARQUET_LIBRARY parquet_static)
set (USE_PARQUET 1)
endif ()
if(ARROW_INCLUDE_DIR AND PARQUET_INCLUDE_DIR)
elseif(NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT OS_FREEBSD)
include(cmake/find_snappy.cmake)
set(CAN_USE_INTERNAL_PARQUET_LIBRARY 1)
include(CheckCXXSourceCompiles)
if(NOT USE_INTERNAL_DOUBLE_CONVERSION_LIBRARY)
set(CMAKE_REQUIRED_LIBRARIES ${DOUBLE_CONVERSION_LIBRARIES})
set(CMAKE_REQUIRED_INCLUDES ${DOUBLE_CONVERSION_INCLUDE_DIR})
check_cxx_source_compiles("
#include <double-conversion/double-conversion.h>
int main() { static const int flags_ = double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY; return 0;}
" HAVE_DOUBLE_CONVERSION_ALLOW_CASE_INSENSIBILITY)
if (USE_PARQUET)
message (STATUS "Using Parquet: ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR}")
else ()
message (STATUS "Building without Parquet support")
endif ()
if(NOT HAVE_DOUBLE_CONVERSION_ALLOW_CASE_INSENSIBILITY) # HAVE_STD_RANDOM_SHUFFLE
message(STATUS "Disabling internal parquet library because arrow is broken (can't use old double_conversion)")
set(CAN_USE_INTERNAL_PARQUET_LIBRARY 0)
endif()
endif()
if(NOT CAN_USE_INTERNAL_PARQUET_LIBRARY)
set(USE_INTERNAL_PARQUET_LIBRARY 0)
else()
set(USE_INTERNAL_PARQUET_LIBRARY 1)
if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
set(ARROW_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src")
set(PARQUET_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src" ${ClickHouse_BINARY_DIR}/contrib/arrow/cpp/src)
endif()
if(${USE_STATIC_LIBRARIES})
set(ARROW_LIBRARY arrow_static)
set(PARQUET_LIBRARY parquet_static)
set(THRIFT_LIBRARY thrift_static)
else()
set(ARROW_LIBRARY arrow_shared)
set(PARQUET_LIBRARY parquet_shared)
if(USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
list(APPEND PARQUET_LIBRARY ${Boost_REGEX_LIBRARY})
endif()
set(THRIFT_LIBRARY thrift)
endif()
set(USE_PARQUET 1)
endif()
endif()
if(USE_PARQUET)
message(STATUS "Using Parquet: ${ARROW_LIBRARY}:${ARROW_INCLUDE_DIR} ; ${PARQUET_LIBRARY}:${PARQUET_INCLUDE_DIR} ; ${THRIFT_LIBRARY}")
else()
message(STATUS "Building without Parquet support")
endif()

27
cmake/find_snappy.cmake Normal file
View File

@ -0,0 +1,27 @@
# Locates the snappy compression library: either a system installation or the
# bundled copy in the contrib/snappy submodule.
# Sets SNAPPY_LIBRARY, SNAPPY_INCLUDE_DIR, USE_INTERNAL_SNAPPY_LIBRARY and,
# when both a library and an include dir are known, USE_SNAPPY.
option(USE_INTERNAL_SNAPPY_LIBRARY "Set to FALSE to use system snappy library instead of bundled" ${NOT_UNBUNDLED})

# The bundled copy can only be used when the submodule is checked out.
if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/snappy/snappy.h")
    set(MISSING_INTERNAL_SNAPPY_LIBRARY 1)
    if(USE_INTERNAL_SNAPPY_LIBRARY)
        message(WARNING "submodule contrib/snappy is missing. to fix try run: \n git submodule update --init --recursive")
        set(USE_INTERNAL_SNAPPY_LIBRARY 0)
    endif()
endif()

# Probe the system only when the bundled copy was not requested (or is unavailable).
if(NOT USE_INTERNAL_SNAPPY_LIBRARY)
    find_path(SNAPPY_INCLUDE_DIR NAMES snappy.h PATHS ${SNAPPY_INCLUDE_PATHS})
    find_library(SNAPPY_LIBRARY snappy)
endif()

# Nothing usable found so far: fall back to the bundled copy if it exists.
if(NOT MISSING_INTERNAL_SNAPPY_LIBRARY AND NOT (SNAPPY_LIBRARY AND SNAPPY_INCLUDE_DIR))
    set(USE_INTERNAL_SNAPPY_LIBRARY 1)
    set(SNAPPY_LIBRARY snappy)
    set(SNAPPY_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/snappy)
endif()

if(SNAPPY_LIBRARY AND SNAPPY_INCLUDE_DIR)
    set(USE_SNAPPY 1)
endif()

message(STATUS "Using snappy=${USE_SNAPPY}: ${SNAPPY_INCLUDE_DIR} : ${SNAPPY_LIBRARY}")

View File

@ -151,17 +151,61 @@ if (USE_INTERNAL_CAPNP_LIBRARY)
endif ()
if (USE_INTERNAL_PARQUET_LIBRARY)
set (ARROW_COMPUTE ON)
set (ARROW_PARQUET ON)
set (ARROW_VERBOSE_THIRDPARTY_BUILD ON)
set (PARQUET_ARROW_LINKAGE "static")
set (ARROW_BUILD_STATIC ON)
if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
# We don't use arrow's CMake files because they pull in too many dependencies and download some libraries at build time.
# But this mode can be used for updating auto-generated parquet files:
# cmake -DUSE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE=1 -DUSE_STATIC_LIBRARIES=0
# copy {BUILD_DIR}/contrib/arrow/cpp/src/parquet/*.cpp,*.h -> /contrib/arrow-cmake/cpp/src/parquet/
# Also useful parquet reader:
# cd contrib/arrow/cpp/build && mkdir -p build && cmake .. -DPARQUET_BUILD_EXECUTABLES=1 && make -j8
# contrib/arrow/cpp/build/debug/parquet-reader some_file.parquet
set (ARROW_COMPUTE ON CACHE INTERNAL "")
set (ARROW_PARQUET ON CACHE INTERNAL "")
set (ARROW_VERBOSE_THIRDPARTY_BUILD ON CACHE INTERNAL "")
set (ARROW_BUILD_SHARED 1 CACHE INTERNAL "")
set (ARROW_BOOST_HEADER_ONLY ON CACHE INTERNAL "")
#set (BOOST_INCLUDEDIR Boost_INCLUDE_DIRS)
set (Boost_FOUND 1 CACHE INTERNAL "")
#set (ZLIB_HOME ${ZLIB_INCLUDE_DIR})
#set (ZLIB_FOUND 1)
if (MAKE_STATIC_LIBRARIES)
set (PARQUET_ARROW_LINKAGE "static" CACHE INTERNAL "")
set (ARROW_TEST_LINKAGE "static" CACHE INTERNAL "")
set (ARROW_BUILD_STATIC ${MAKE_STATIC_LIBRARIES} CACHE INTERNAL "")
else()
set (PARQUET_ARROW_LINKAGE "shared" CACHE INTERNAL "")
set (ARROW_TEST_LINKAGE "shared" CACHE INTERNAL "")
endif()
if(CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO")
set(_save_build_type ${CMAKE_BUILD_TYPE})
set(CMAKE_BUILD_TYPE RELEASE)
endif()
# Because Arrow uses CMAKE_SOURCE_DIR as a project path
# Hopefully will be fixed in https://github.com/apache/arrow/pull/2676
set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/cmake_modules")
add_subdirectory (arrow/cpp)
endif ()
if(_save_build_type)
set(CMAKE_BUILD_TYPE ${_save_build_type})
endif()
else()
if(USE_INTERNAL_SNAPPY_LIBRARY)
set(SNAPPY_BUILD_TESTS 0 CACHE INTERNAL "")
if (NOT MAKE_STATIC_LIBRARIES)
set(BUILD_SHARED_LIBS 1) # TODO: set at root dir
endif()
add_subdirectory(snappy)
endif()
add_subdirectory(arrow-cmake)
endif()
endif()
if (USE_INTERNAL_POCO_LIBRARY)
set (POCO_VERBOSE_MESSAGES 0 CACHE INTERNAL "")

2
contrib/arrow vendored

@ -1 +1 @@
Subproject commit af20905877fb353367d7ee5a808f759532a5ca0f
Subproject commit 87ac6fddaf21d0b4ee8b8090533ff293db0da1b4

View File

@ -0,0 +1,212 @@
# === thrift
# Builds the Thrift C++ runtime directly from the contrib/thrift submodule sources
# as the target named by ${THRIFT_LIBRARY} (expected to be set before this file is processed).
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/thrift/lib/cpp)
# contrib/thrift/lib/cpp/CMakeLists.txt
# Core runtime sources: exceptions, async helpers, protocols, transports and servers.
set(thriftcpp_SOURCES
${LIBRARY_DIR}/src/thrift/TApplicationException.cpp
${LIBRARY_DIR}/src/thrift/TOutput.cpp
${LIBRARY_DIR}/src/thrift/async/TAsyncChannel.cpp
${LIBRARY_DIR}/src/thrift/async/TAsyncProtocolProcessor.cpp
${LIBRARY_DIR}/src/thrift/async/TConcurrentClientSyncInfo.h
${LIBRARY_DIR}/src/thrift/async/TConcurrentClientSyncInfo.cpp
${LIBRARY_DIR}/src/thrift/concurrency/ThreadManager.cpp
${LIBRARY_DIR}/src/thrift/concurrency/TimerManager.cpp
${LIBRARY_DIR}/src/thrift/concurrency/Util.cpp
${LIBRARY_DIR}/src/thrift/processor/PeekProcessor.cpp
${LIBRARY_DIR}/src/thrift/protocol/TBase64Utils.cpp
${LIBRARY_DIR}/src/thrift/protocol/TDebugProtocol.cpp
${LIBRARY_DIR}/src/thrift/protocol/TJSONProtocol.cpp
${LIBRARY_DIR}/src/thrift/protocol/TMultiplexedProtocol.cpp
${LIBRARY_DIR}/src/thrift/protocol/TProtocol.cpp
${LIBRARY_DIR}/src/thrift/transport/TTransportException.cpp
${LIBRARY_DIR}/src/thrift/transport/TFDTransport.cpp
${LIBRARY_DIR}/src/thrift/transport/TSimpleFileTransport.cpp
${LIBRARY_DIR}/src/thrift/transport/THttpTransport.cpp
${LIBRARY_DIR}/src/thrift/transport/THttpClient.cpp
${LIBRARY_DIR}/src/thrift/transport/THttpServer.cpp
${LIBRARY_DIR}/src/thrift/transport/TSocket.cpp
${LIBRARY_DIR}/src/thrift/transport/TSocketPool.cpp
${LIBRARY_DIR}/src/thrift/transport/TServerSocket.cpp
${LIBRARY_DIR}/src/thrift/transport/TTransportUtils.cpp
${LIBRARY_DIR}/src/thrift/transport/TBufferTransports.cpp
${LIBRARY_DIR}/src/thrift/server/TConnectedClient.cpp
${LIBRARY_DIR}/src/thrift/server/TServerFramework.cpp
${LIBRARY_DIR}/src/thrift/server/TSimpleServer.cpp
${LIBRARY_DIR}/src/thrift/server/TThreadPoolServer.cpp
${LIBRARY_DIR}/src/thrift/server/TThreadedServer.cpp
)
# Threading primitives, kept in a separate list and compiled into the same target below.
set( thriftcpp_threads_SOURCES
${LIBRARY_DIR}/src/thrift/concurrency/ThreadFactory.cpp
${LIBRARY_DIR}/src/thrift/concurrency/Thread.cpp
${LIBRARY_DIR}/src/thrift/concurrency/Monitor.cpp
${LIBRARY_DIR}/src/thrift/concurrency/Mutex.cpp
)
# One library made of both source lists; ${LINK_MODE} is defined outside this file.
add_library(${THRIFT_LIBRARY} ${LINK_MODE} ${thriftcpp_SOURCES} ${thriftcpp_threads_SOURCES})
set_target_properties(${THRIFT_LIBRARY} PROPERTIES CXX_STANDARD 14) # REMOVE after https://github.com/apache/thrift/pull/1641
# Thrift headers are exported to consumers as system includes; Boost headers are needed only to build thrift itself.
target_include_directories(${THRIFT_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/thrift/lib/cpp/src PRIVATE ${Boost_INCLUDE_DIRS})
# === arrow
# Builds the Arrow C++ library directly from the contrib/arrow submodule sources
# as the target named by ${ARROW_LIBRARY} (expected to be set before this file is processed).
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow)
# arrow/cpp/src/arrow/CMakeLists.txt
# Core, csv, io and util sources.
set(ARROW_SRCS
${LIBRARY_DIR}/array.cc
${LIBRARY_DIR}/builder.cc
${LIBRARY_DIR}/array/builder_adaptive.cc
${LIBRARY_DIR}/array/builder_base.cc
${LIBRARY_DIR}/array/builder_binary.cc
${LIBRARY_DIR}/array/builder_decimal.cc
${LIBRARY_DIR}/array/builder_dict.cc
${LIBRARY_DIR}/array/builder_nested.cc
${LIBRARY_DIR}/array/builder_primitive.cc
${LIBRARY_DIR}/buffer.cc
${LIBRARY_DIR}/compare.cc
${LIBRARY_DIR}/memory_pool.cc
${LIBRARY_DIR}/pretty_print.cc
${LIBRARY_DIR}/record_batch.cc
${LIBRARY_DIR}/status.cc
${LIBRARY_DIR}/table.cc
${LIBRARY_DIR}/table_builder.cc
${LIBRARY_DIR}/tensor.cc
${LIBRARY_DIR}/sparse_tensor.cc
${LIBRARY_DIR}/type.cc
${LIBRARY_DIR}/visitor.cc
${LIBRARY_DIR}/csv/converter.cc
${LIBRARY_DIR}/csv/chunker.cc
${LIBRARY_DIR}/csv/column-builder.cc
${LIBRARY_DIR}/csv/options.cc
${LIBRARY_DIR}/csv/parser.cc
${LIBRARY_DIR}/csv/reader.cc
${LIBRARY_DIR}/io/buffered.cc
${LIBRARY_DIR}/io/compressed.cc
${LIBRARY_DIR}/io/file.cc
${LIBRARY_DIR}/io/interfaces.cc
${LIBRARY_DIR}/io/memory.cc
${LIBRARY_DIR}/io/readahead.cc
${LIBRARY_DIR}/util/bit-util.cc
${LIBRARY_DIR}/util/compression.cc
${LIBRARY_DIR}/util/cpu-info.cc
${LIBRARY_DIR}/util/decimal.cc
${LIBRARY_DIR}/util/int-util.cc
${LIBRARY_DIR}/util/io-util.cc
${LIBRARY_DIR}/util/logging.cc
${LIBRARY_DIR}/util/key_value_metadata.cc
${LIBRARY_DIR}/util/task-group.cc
${LIBRARY_DIR}/util/thread-pool.cc
${LIBRARY_DIR}/util/trie.cc
${LIBRARY_DIR}/util/utf8.cc
)
# Compute kernels are appended to the same source list.
set(ARROW_SRCS ${ARROW_SRCS}
${LIBRARY_DIR}/compute/context.cc
${LIBRARY_DIR}/compute/kernels/boolean.cc
${LIBRARY_DIR}/compute/kernels/cast.cc
${LIBRARY_DIR}/compute/kernels/hash.cc
${LIBRARY_DIR}/compute/kernels/util-internal.cc
)
# A compression codec is enabled only when both its include dir and its library variable are set.
if (LZ4_INCLUDE_DIR AND LZ4_LIBRARY)
set(ARROW_WITH_LZ4 1)
endif()
if(SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY)
set(ARROW_WITH_SNAPPY 1)
endif()
if(ZLIB_INCLUDE_DIR AND ZLIB_LIBRARIES)
set(ARROW_WITH_ZLIB 1)
endif()
if (ZSTD_INCLUDE_DIR AND ZSTD_LIBRARY)
set(ARROW_WITH_ZSTD 1)
endif()
# For every enabled codec: define ARROW_WITH_<CODEC> and compile the matching wrapper source.
# NOTE(review): add_definitions() is directory-scoped, so these macros presumably reach the other
# targets declared in this file as well, not only arrow - confirm that this is intended.
if (ARROW_WITH_LZ4)
add_definitions(-DARROW_WITH_LZ4)
SET(ARROW_SRCS ${LIBRARY_DIR}/util/compression_lz4.cc ${ARROW_SRCS})
endif()
if (ARROW_WITH_SNAPPY)
add_definitions(-DARROW_WITH_SNAPPY)
SET(ARROW_SRCS ${LIBRARY_DIR}/util/compression_snappy.cc ${ARROW_SRCS})
endif()
if (ARROW_WITH_ZLIB)
add_definitions(-DARROW_WITH_ZLIB)
SET(ARROW_SRCS ${LIBRARY_DIR}/util/compression_zlib.cc ${ARROW_SRCS})
endif()
if (ARROW_WITH_ZSTD)
add_definitions(-DARROW_WITH_ZSTD)
SET(ARROW_SRCS ${LIBRARY_DIR}/util/compression_zstd.cc ${ARROW_SRCS})
endif()
# ${LINK_MODE} is defined outside this file.
add_library(${ARROW_LIBRARY} ${LINK_MODE} ${ARROW_SRCS})
# The submodule's cpp/src is exported to consumers; this directory's cpp/src and Boost are used only to build arrow itself.
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src ${Boost_INCLUDE_DIRS})
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} Threads::Threads)
# Link each enabled codec privately.
if (ARROW_WITH_LZ4)
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${LZ4_LIBRARY})
endif()
if (ARROW_WITH_SNAPPY)
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${SNAPPY_LIBRARY})
endif()
if (ARROW_WITH_ZLIB)
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${ZLIB_LIBRARIES})
endif()
if (ARROW_WITH_ZSTD)
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${ZSTD_LIBRARY})
endif()
# === parquet
# Builds parquet-cpp directly from the contrib/arrow submodule sources
# as the target named by ${PARQUET_LIBRARY} (expected to be set before this file is processed).
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/parquet)
# arrow/cpp/src/parquet/CMakeLists.txt
set(PARQUET_SRCS
${LIBRARY_DIR}/arrow/reader.cc
${LIBRARY_DIR}/arrow/record_reader.cc
${LIBRARY_DIR}/arrow/schema.cc
${LIBRARY_DIR}/arrow/writer.cc
${LIBRARY_DIR}/bloom_filter.cc
${LIBRARY_DIR}/column_reader.cc
${LIBRARY_DIR}/column_scanner.cc
${LIBRARY_DIR}/column_writer.cc
${LIBRARY_DIR}/file_reader.cc
${LIBRARY_DIR}/file_writer.cc
${LIBRARY_DIR}/metadata.cc
${LIBRARY_DIR}/murmur3.cc
${LIBRARY_DIR}/printer.cc
${LIBRARY_DIR}/schema.cc
${LIBRARY_DIR}/statistics.cc
${LIBRARY_DIR}/types.cc
${LIBRARY_DIR}/util/comparison.cc
${LIBRARY_DIR}/util/memory.cc
)
#list(TRANSFORM PARQUET_SRCS PREPEND ${LIBRARY_DIR}/) # cmake 3.12
# These two sources are taken from this directory's cpp/src/parquet, not from the submodule.
list(APPEND PARQUET_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/parquet/parquet_constants.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/parquet/parquet_types.cpp
)
# ${LINK_MODE} is defined outside this file.
add_library(${PARQUET_LIBRARY} ${LINK_MODE} ${PARQUET_SRCS})
# Both the submodule headers and this directory's cpp/src are exported to consumers as system includes.
target_include_directories(${PARQUET_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src)
include(${ClickHouse_SOURCE_DIR}/contrib/thrift/build/cmake/ConfigureChecks.cmake) # makes config.h
# arrow, thrift and boost regex are implementation dependencies and are linked privately.
target_link_libraries(${PARQUET_LIBRARY} PRIVATE ${ARROW_LIBRARY} ${THRIFT_LIBRARY} ${Boost_REGEX_LIBRARY})
target_include_directories(${PARQUET_LIBRARY} PRIVATE ${Boost_INCLUDE_DIRS})
# === tools
# Builds the parquet command-line tools listed in PARQUET_TOOLS. Each tool is a
# single source file <tool>.cc under contrib/arrow/cpp/tools/parquet and is
# linked against the parquet library built above.
set(TOOLS_DIR ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/tools/parquet)
set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan)
foreach(TOOL ${PARQUET_TOOLS})
    add_executable(${TOOL} ${TOOLS_DIR}/${TOOL}.cc)
    # Keyword signature: the plain (keyword-less) form has legacy semantics and
    # cannot be mixed with keyword calls on the same target later on.
    target_link_libraries(${TOOL} PRIVATE ${PARQUET_LIBRARY})
endforeach()

View File

@ -0,0 +1 @@
../../../thrift/build/cmake/config.h.in

View File

@ -0,0 +1,17 @@
/**
* Autogenerated by Thrift Compiler (0.11.0)
*
* DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
* @generated
*/
#include "parquet_constants.h"
namespace parquet { namespace format {
const parquetConstants g_parquet_constants;
parquetConstants::parquetConstants() {
}
}} // namespace

View File

@ -0,0 +1,24 @@
/**
* Autogenerated by Thrift Compiler (0.11.0)
*
* DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
* @generated
*/
#ifndef parquet_CONSTANTS_H
#define parquet_CONSTANTS_H
#include "parquet_types.h"
namespace parquet { namespace format {
class parquetConstants {
public:
parquetConstants();
};
extern const parquetConstants g_parquet_constants;
}} // namespace
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef PARQUET_VERSION_H
#define PARQUET_VERSION_H
// define the parquet created by version
#define CREATED_BY_VERSION "parquet-cpp version 1.5.1-SNAPSHOT"
#endif // PARQUET_VERSION_H

View File

@ -0,0 +1,11 @@
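/*
Temporary workaround for a build break that appeared after Thrift commit 17355425
("THRIFT-4735: Remove Qt4 build support").
This empty stub fixes the following error:
../contrib/arrow-cmake/cpp/src/parquet/parquet_types.h:18:10: fatal error: thrift/stdcxx.h: No such file or directory
#include <thrift/stdcxx.h>
Delete this file once the workaround is no longer needed.
*/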

2
contrib/boost vendored

@ -1 +1 @@
Subproject commit 6883b40449f378019aec792f9983ce3afc7ff16e
Subproject commit 6a96e8b59f76148eb8ad54a9d15259f8ce84c606

View File

@ -10,49 +10,30 @@
# Important boost patch: 094c18b
#
set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/boost)
include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
if (NOT MSVC)
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/boost)
if(NOT MSVC)
add_definitions(-Wno-unused-variable -Wno-deprecated-declarations)
endif ()
endif()
add_library(boost_program_options_internal ${LINK_MODE}
${LIBRARY_DIR}/libs/program_options/src/cmdline.cpp
${LIBRARY_DIR}/libs/program_options/src/config_file.cpp
${LIBRARY_DIR}/libs/program_options/src/convert.cpp
${LIBRARY_DIR}/libs/program_options/src/options_description.cpp
${LIBRARY_DIR}/libs/program_options/src/parsers.cpp
${LIBRARY_DIR}/libs/program_options/src/positional_options.cpp
${LIBRARY_DIR}/libs/program_options/src/split.cpp
${LIBRARY_DIR}/libs/program_options/src/utf8_codecvt_facet.cpp
${LIBRARY_DIR}/libs/program_options/src/value_semantic.cpp
${LIBRARY_DIR}/libs/program_options/src/variables_map.cpp
${LIBRARY_DIR}/libs/program_options/src/winmain.cpp)
# add_boost_lib(<lib_name>)
# Declares the library target boost_<lib_name>_internal from the sources under
# ${LIBRARY_DIR}/libs/<lib_name>/src, exports ${Boost_INCLUDE_DIRS} as system include
# directories (placed BEFORE other include paths) and publicly defines BOOST_SYSTEM_NO_DEPRECATED.
# NOTE(review): add_headers_and_sources() is defined outside this file; it is assumed to fill
# boost_<lib_name>_sources with the files found in the given directory - confirm.
macro(add_boost_lib lib_name)
add_headers_and_sources(boost_${lib_name} ${LIBRARY_DIR}/libs/${lib_name}/src)
add_library(boost_${lib_name}_internal ${LINK_MODE} ${boost_${lib_name}_sources})
target_include_directories(boost_${lib_name}_internal SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS})
target_compile_definitions(boost_${lib_name}_internal PUBLIC BOOST_SYSTEM_NO_DEPRECATED)
endmacro()
add_library(boost_filesystem_internal ${LINK_MODE}
${LIBRARY_DIR}/libs/filesystem/src/codecvt_error_category.cpp
${LIBRARY_DIR}/libs/filesystem/src/operations.cpp
${LIBRARY_DIR}/libs/filesystem/src/path.cpp
${LIBRARY_DIR}/libs/filesystem/src/path_traits.cpp
${LIBRARY_DIR}/libs/filesystem/src/portability.cpp
${LIBRARY_DIR}/libs/filesystem/src/unique_path.cpp
${LIBRARY_DIR}/libs/filesystem/src/utf8_codecvt_facet.cpp
${LIBRARY_DIR}/libs/filesystem/src/windows_file_codecvt.cpp)
add_boost_lib(system)
add_library(boost_system_internal ${LINK_MODE}
${LIBRARY_DIR}/libs/system/src/error_code.cpp)
add_boost_lib(program_options)
add_library(boost_random_internal ${LINK_MODE}
${LIBRARY_DIR}/libs/random/src/random_device.cpp)
add_boost_lib(filesystem)
target_link_libraries(boost_filesystem_internal PRIVATE boost_system_internal)
target_link_libraries (boost_filesystem_internal PUBLIC boost_system_internal)
#add_boost_lib(random)
target_include_directories (boost_program_options_internal SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS})
target_include_directories (boost_filesystem_internal SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS})
target_include_directories (boost_system_internal SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS})
target_include_directories (boost_random_internal SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS})
target_compile_definitions (boost_program_options_internal PUBLIC BOOST_SYSTEM_NO_DEPRECATED)
target_compile_definitions (boost_filesystem_internal PUBLIC BOOST_SYSTEM_NO_DEPRECATED)
target_compile_definitions (boost_system_internal PUBLIC BOOST_SYSTEM_NO_DEPRECATED)
target_compile_definitions (boost_random_internal PUBLIC BOOST_SYSTEM_NO_DEPRECATED)
if (USE_INTERNAL_PARQUET_LIBRARY)
add_boost_lib(regex)
endif()

View File

@ -51,10 +51,10 @@ set(SRCS
${RDKAFKA_SOURCE_DIR}/snappy.c
${RDKAFKA_SOURCE_DIR}/tinycthread.c
${RDKAFKA_SOURCE_DIR}/tinycthread_extra.c
${RDKAFKA_SOURCE_DIR}/xxhash.c
${RDKAFKA_SOURCE_DIR}/lz4.c
${RDKAFKA_SOURCE_DIR}/lz4frame.c
${RDKAFKA_SOURCE_DIR}/lz4hc.c
#${RDKAFKA_SOURCE_DIR}/xxhash.c
#${RDKAFKA_SOURCE_DIR}/lz4.c
#${RDKAFKA_SOURCE_DIR}/lz4frame.c
#${RDKAFKA_SOURCE_DIR}/lz4hc.c
${RDKAFKA_SOURCE_DIR}/rdgz.c
)

View File

@ -3,6 +3,10 @@ SET(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/lz4/lib)
add_library (lz4
${LIBRARY_DIR}/lz4.c
${LIBRARY_DIR}/lz4hc.c
${LIBRARY_DIR}/lz4frame.c
${LIBRARY_DIR}/lz4frame.h
${LIBRARY_DIR}/xxhash.c
${LIBRARY_DIR}/xxhash.h
${LIBRARY_DIR}/lz4.h
${LIBRARY_DIR}/lz4hc.h

1
contrib/snappy vendored Submodule

@ -0,0 +1 @@
Subproject commit 3f194acb57e0487531c96b97af61dcbd025a78a3

1
contrib/thrift vendored Submodule

@ -0,0 +1 @@
Subproject commit 010ccf0a0c7023fea0f6bf4e4078ebdff7e61982

View File

@ -295,9 +295,9 @@ if (USE_RDKAFKA)
endif ()
if (USE_PARQUET)
target_link_libraries(dbms ${PARQUET_LIBRARY} ${ARROW_LIBRARY})
if (NOT USE_INTERNAL_PARQUET_LIBRARY)
target_include_directories (dbms BEFORE PRIVATE ${PARQUET_INCLUDE_DIR} ${ARROW_INCLUDE_DIR})
target_link_libraries(dbms PRIVATE ${PARQUET_LIBRARY})
if (NOT USE_INTERNAL_PARQUET_LIBRARY OR USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE)
target_include_directories (dbms SYSTEM BEFORE PRIVATE ${PARQUET_INCLUDE_DIR} ${ARROW_INCLUDE_DIR})
endif ()
endif ()

View File

@ -478,7 +478,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
global_context->setFormatSchemaPath(format_schema_path.path());
format_schema_path.createDirectories();
LOG_INFO(log, "Loading metadata.");
LOG_INFO(log, "Loading metadata from " + path);
try
{
loadMetadataSystem(*global_context);

View File

@ -14,6 +14,8 @@
#cmakedefine01 USE_POCO_MONGODB
#cmakedefine01 USE_POCO_NETSSL
#cmakedefine01 USE_BASE64
#cmakedefine01 USE_SNAPPY
#cmakedefine01 USE_PARQUET
#cmakedefine01 USE_HDFS
#cmakedefine01 USE_XXHASH
#cmakedefine01 USE_INTERNAL_LLVM_LIBRARY

View File

@ -1,336 +0,0 @@
#include <algorithm>
#include <iterator>
#include <vector>
// TODO: clear includes
#include <Core/ColumnWithTypeAndName.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <common/DateLUTImpl.h>
#include <DataStreams/ParquetBlockInputStream.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeNullable.h>
#include <IO/BufferBase.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteHelpers.h>
#include <ext/range.h>
#include <arrow/buffer.h>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>
#include <IO/copyData.h>
#include <IO/WriteBufferFromString.h>
namespace DB
{
/// Remembers the input buffer and the header block; nothing is read at construction time.
ParquetBlockInputStream::ParquetBlockInputStream(ReadBuffer & istr_, const Block & header_)
: istr(istr_)
, header(header_)
{
}
/// Returns the header block that was passed to the constructor.
Block ParquetBlockInputStream::getHeader() const
{
return header;
}
/// Appends the values of every chunk of a numeric Arrow column to the internal column.
/// The raw value buffer of each chunk is copied as a whole, without per-element conversion.
template <typename NumericType>
void ParquetBlockInputStream::fillColumnWithNumericData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & destination = static_cast<ColumnVector<NumericType> &>(*internal_column).getData();
    destination.reserve(arrow_column->length());

    const size_t chunks_count = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_index = 0; chunk_index < chunks_count; ++chunk_index)
    {
        const std::shared_ptr<arrow::Array> current_chunk = arrow_column->data()->chunk(chunk_index);

        /// buffers[0] holds the null bitmap, buffers[1] holds the values themselves
        const std::shared_ptr<arrow::Buffer> values_buffer = current_chunk->data()->buffers[1];
        const auto * values_begin = reinterpret_cast<const NumericType *>(values_buffer->data());
        const auto * values_end = values_begin + current_chunk->length();

        destination.insert_assume_reserved(values_begin, values_end);
    }
}
/// Inserts chars and offsets right into internal column data to reduce an overhead.
/// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars.
/// Also internal strings are null terminated.
///
/// Works in two passes over the chunks of `arrow_column`: the first pass computes how many bytes
/// have to be reserved, the second pass copies every string followed by a terminating zero byte
/// and records the resulting end offset.
void ParquetBlockInputStream::fillColumnWithStringData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    PaddedPODArray<UInt8> & column_chars_t = static_cast<ColumnString &>(*internal_column).getChars();
    PaddedPODArray<UInt64> & column_offsets = static_cast<ColumnString &>(*internal_column).getOffsets();

    size_t chars_t_size = 0;
    for (size_t chunk_i = 0; chunk_i != static_cast<size_t>(arrow_column->data()->num_chunks()); ++chunk_i)
    {
        arrow::BinaryArray & chunk = static_cast<arrow::BinaryArray &>(*(arrow_column->data()->chunk(chunk_i)));
        const size_t chunk_length = chunk.length();

        /// An empty chunk has no last element: `chunk_length - 1` would wrap around and
        /// value_offset()/value_length() would be asked for an out-of-range index.
        if (chunk_length == 0)
            continue;

        /// End position of the last string of the chunk inside its value buffer
        chars_t_size += chunk.value_offset(chunk_length - 1) + chunk.value_length(chunk_length - 1);
        chars_t_size += chunk_length; /// additional space for null bytes
    }

    column_chars_t.reserve(chars_t_size);
    column_offsets.reserve(arrow_column->length());

    for (size_t chunk_i = 0; chunk_i != static_cast<size_t>(arrow_column->data()->num_chunks()); ++chunk_i)
    {
        arrow::BinaryArray & chunk = static_cast<arrow::BinaryArray &>(*(arrow_column->data()->chunk(chunk_i)));
        std::shared_ptr<arrow::Buffer> buffer = chunk.value_data();
        const size_t chunk_length = chunk.length();

        for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i)
        {
            const UInt8 * raw_data = buffer->data() + chunk.value_offset(offset_i);
            column_chars_t.insert_assume_reserved(raw_data, raw_data + chunk.value_length(offset_i));
            column_chars_t.emplace_back('\0');

            /// The internal offset points right after the terminating zero byte
            column_offsets.emplace_back(column_chars_t.size());
        }
    }
}
/// Copies the values of a boolean Arrow column into the internal UInt8 column, one byte per value.
/// Values are read through BooleanArray::Value() rather than copied as a raw buffer.
/// The internal column is resized to the total length of the Arrow column and every chunk
/// is written right after the values of the previous chunks.
void ParquetBlockInputStream::fillColumnWithBooleanData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    PaddedPODArray<UInt8> & column_data = static_cast<ColumnVector<UInt8> &>(*internal_column).getData();
    column_data.resize(arrow_column->length());

    /// Position in column_data where the current chunk starts. Without it every chunk
    /// would be written from index 0, overwriting the previous chunks and leaving the tail unset.
    size_t column_offset = 0;

    for (size_t chunk_i = 0; chunk_i != static_cast<size_t>(arrow_column->data()->num_chunks()); ++chunk_i)
    {
        arrow::BooleanArray & chunk = static_cast<arrow::BooleanArray &>(*(arrow_column->data()->chunk(chunk_i)));
        const size_t chunk_length = static_cast<size_t>(chunk.length());

        for (size_t bool_i = 0; bool_i != chunk_length; ++bool_i)
            column_data[column_offset + bool_i] = chunk.Value(bool_i);

        column_offset += chunk_length;
    }
}
/// Parquet::DATE arrives from Arrow as Int32 day numbers, whereas a ClickHouse Date is a UInt16.
/// Every value is therefore range-checked against DATE_LUT_MAX_DAY_NUM before it is appended;
/// a value above the limit aborts the conversion with an exception.
void ParquetBlockInputStream::fillColumnWithDate32Data(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & days = static_cast<ColumnVector<UInt16> &>(*internal_column).getData();
    days.reserve(arrow_column->length());

    const size_t chunks_count = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_index = 0; chunk_index < chunks_count; ++chunk_index)
    {
        auto & date_chunk = static_cast<arrow::Date32Array &>(*(arrow_column->data()->chunk(chunk_index)));
        const size_t rows = static_cast<size_t>(date_chunk.length());

        for (size_t row = 0; row < rows; ++row)
        {
            const UInt32 days_num = static_cast<UInt32>(date_chunk.Value(row));

            if (days_num <= DATE_LUT_MAX_DAY_NUM)
            {
                days.emplace_back(days_num);
                continue;
            }

            // TODO: will it rollback correctly?
            throw Exception(
                "Input value " + std::to_string(days_num) + " of a column \"" + arrow_column->name() + "\" is greater than "
                "max allowed Date value, which is " + std::to_string(DATE_LUT_MAX_DAY_NUM)
            );
        }
    }
}
/// Builds a null bytemap from an Arrow column: for every row of every chunk one byte is appended,
/// holding the result of arrow::Array::IsNull() for that row.
void ParquetBlockInputStream::fillByteMapFromArrowColumn(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & bytemap)
{
    auto & null_flags = static_cast<ColumnVector<UInt8> &>(*bytemap).getData();
    null_flags.reserve(arrow_column->length());

    const size_t chunks_count = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
    {
        const std::shared_ptr<arrow::Array> current_chunk = arrow_column->data()->chunk(chunk_idx);
        const size_t rows = static_cast<size_t>(current_chunk->length());

        for (size_t row = 0; row < rows; ++row)
            null_flags.emplace_back(current_chunk->IsNull(row));
    }
}
/// X-macro listing the Arrow numeric type ids together with the C++ numeric type used for them.
/// M is invoked once per pair as M(arrow_type_id, cpp_type); readImpl() passes a macro that expands
/// each pair into a `case` calling fillColumnWithNumericData<cpp_type>.
#define FOR_ARROW_NUMERIC_TYPES(M) \
M(arrow::Type::UINT8, UInt8) \
M(arrow::Type::INT8, Int8) \
M(arrow::Type::UINT16, UInt16) \
M(arrow::Type::INT16, Int16) \
M(arrow::Type::UINT32, UInt32) \
M(arrow::Type::INT32, Int32) \
M(arrow::Type::UINT64, UInt64) \
M(arrow::Type::INT64, Int64) \
M(arrow::Type::FLOAT, Float32) \
M(arrow::Type::DOUBLE, Float64)
/// Lookup from a column name to the Arrow column carrying that name.
using NameToColumnPtr = std::unordered_map<std::string, std::shared_ptr<arrow::Column>>;
/// Maps an Arrow type id to the ClickHouse data type it is converted to.
/// An Arrow type that is missing from this map is rejected by readImpl() as unsupported.
/// Note that BOOL is stored as UInt8 and DATE32 as Date.
const std::unordered_map<arrow::Type::type, std::shared_ptr<IDataType>> ParquetBlockInputStream::arrow_type_to_internal_type = {
{arrow::Type::UINT8, std::make_shared<DataTypeUInt8>()},
{arrow::Type::INT8, std::make_shared<DataTypeInt8>()},
{arrow::Type::UINT16, std::make_shared<DataTypeUInt16>()},
{arrow::Type::INT16, std::make_shared<DataTypeInt16>()},
{arrow::Type::UINT32, std::make_shared<DataTypeUInt32>()},
{arrow::Type::INT32, std::make_shared<DataTypeInt32>()},
{arrow::Type::UINT64, std::make_shared<DataTypeUInt64>()},
{arrow::Type::INT64, std::make_shared<DataTypeInt64>()},
{arrow::Type::FLOAT, std::make_shared<DataTypeFloat32>()},
{arrow::Type::DOUBLE, std::make_shared<DataTypeFloat64>()},
{arrow::Type::BOOL, std::make_shared<DataTypeUInt8>()},
{arrow::Type::DATE32, std::make_shared<DataTypeDate>()},
{arrow::Type::STRING, std::make_shared<DataTypeString>()}//,
// TODO: add other types that are convertable to internal ones:
// 0. ENUM?
// 1. UUID -> String
// 2. JSON -> String
};
/// Reads the whole input into memory, parses it as one Parquet file into an Arrow table and
/// converts the columns listed in `header` into a single Block.
///
/// Returns an empty Block when the input buffer is already at EOF.
/// Throws Exception when:
///  - Arrow reports a failure while reading the table;
///  - the table has zero rows;
///  - the table has fewer columns than the header, or a header column is not found by name;
///  - an Arrow column type has no entry in arrow_type_to_internal_type;
///  - the Arrow column contains NULLs but the header column is not Nullable;
///  - the converted type name differs from the (nested) type name of the header column.
Block ParquetBlockInputStream::readImpl()
{
Block res;
if (istr.eof())
return res;
/// Copy everything that is left in the input buffer into one in-memory string.
std::string file_data;
{
WriteBufferFromString file_buffer(file_data);
copyData(istr, file_buffer);
}
/// NOTE(review): `buffer` is constructed from `file_data`; presumably it references the string's
/// memory rather than copying it, so `file_data` must outlive the reader - confirm against Arrow docs.
arrow::Buffer buffer(file_data);
// TODO: maybe use parquet::RandomAccessSource?
auto reader = parquet::ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
parquet::arrow::FileReader filereader(::arrow::default_memory_pool(), std::move(reader));
std::shared_ptr<arrow::Table> table;
// TODO: also catch a ParquetException thrown by filereader?
arrow::Status read_status = filereader.ReadTable(&table);
if (!read_status.ok())
throw Exception("Error while reading parquet data: " + read_status.ToString()/*, ErrorCodes::TODO*/);
if (0 == table->num_rows())
throw Exception("Empty table in input data"/*, ErrorCodes::TODO*/);
if (header.columns() > static_cast<size_t>(table->num_columns()))
// TODO: What if some columns were not presented? Insert NULLs? What if a column is not nullable?
throw Exception("Number of columns is less than the table has" /*, ErrorCodes::TODO*/);
/// Index the Arrow columns by name so that header columns can be matched regardless of their order.
NameToColumnPtr name_to_column_ptr;
for (size_t i = 0; i != static_cast<size_t>(table->num_columns()); ++i)
{
std::shared_ptr<arrow::Column> arrow_column = table->column(i);
name_to_column_ptr[arrow_column->name()] = arrow_column;
}
/// Convert the columns in header order; Arrow columns not named in the header are ignored.
for (size_t column_i = 0; column_i != header.columns(); ++column_i)
{
ColumnWithTypeAndName header_column = header.getByPosition(column_i);
if (name_to_column_ptr.find(header_column.name) == name_to_column_ptr.end())
// TODO: What if some columns were not presented? Insert NULLs? What if a column is not nullable?
throw Exception("Column \"" + header_column.name + "\" is not presented in input data" /*, ErrorCodes::TODO*/);
std::shared_ptr<arrow::Column> arrow_column = name_to_column_ptr[header_column.name];
arrow::Type::type arrow_type = arrow_column->type()->id();
if (arrow_type_to_internal_type.find(arrow_type) == arrow_type_to_internal_type.end())
{
throw Exception(
"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + arrow_column->name() + "\""
" is not supported for conversion from a Parquet data format"
/*, ErrorCodes::TODO*/
);
}
// TODO: check if a column is const?
if (!header_column.type->isNullable() && arrow_column->null_count())
{
throw Exception("Can not insert NULL data into non-nullable column \"" + header_column.name + "\""/*, ErrorCodes::TODO*/);
}
/// After the check above, null_count() can only be non-zero here when the header column is Nullable,
/// so this is true exactly when the header column is Nullable.
const bool target_column_is_nullable = header_column.type->isNullable() || arrow_column->null_count();
const DataTypePtr internal_nested_type = arrow_type_to_internal_type.at(arrow_type);
const DataTypePtr internal_type = target_column_is_nullable ? makeNullable(internal_nested_type) : internal_nested_type;
const std::string internal_nested_type_name = internal_nested_type->getName();
/// Type of the header column with one level of Nullable removed, used for the compatibility check below.
const DataTypePtr column_nested_type =
header_column.type->isNullable()
? static_cast<const DataTypeNullable *>(header_column.type.get())->getNestedType()
: header_column.type;
/// `column_type` is not used anywhere else in this function.
const DataTypePtr column_type = header_column.type;
const std::string column_nested_type_name = column_nested_type->getName();
/// Types are compared by name: no conversion is attempted, the names must match exactly.
// TODO: can it be done with typeid_cast?
if (internal_nested_type_name != column_nested_type_name)
{
throw Exception(
"Input data type \"" + internal_nested_type_name + "\" for a column \"" + header_column.name + "\""
" is not compatible with a column type \"" + column_nested_type_name + "\""/*, ErrorCodes::TODO*/
);
}
ColumnWithTypeAndName column;
column.name = header_column.name;
column.type = internal_type;
/// Data
/// The values are always read into a column of the nested (non-Nullable) type;
/// the null map is attached afterwards when the result type is Nullable.
MutableColumnPtr read_column = internal_nested_type->createColumn();
switch (arrow_type)
{
case arrow::Type::STRING:
fillColumnWithStringData(arrow_column, read_column);
break;
case arrow::Type::BOOL:
fillColumnWithBooleanData(arrow_column, read_column);
break;
case arrow::Type::DATE32:
fillColumnWithDate32Data(arrow_column, read_column);
break;
/// Generates one `case` per pair listed in FOR_ARROW_NUMERIC_TYPES.
#define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
case ARROW_NUMERIC_TYPE: \
fillColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, read_column); \
break;
FOR_ARROW_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
// TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds?
// TODO: read JSON as a string?
// TODO: read UUID as a string?
default:
throw Exception("Unsupported parquet type \"" + arrow_column->type()->name() + "\""/*, ErrorCodes::TODO*/);
}
if (column.type->isNullable())
{
/// Wrap the values together with a bytemap built from Arrow's null information.
MutableColumnPtr null_bytemap = DataTypeUInt8().createColumn();
fillByteMapFromArrowColumn(arrow_column, null_bytemap);
column.column = ColumnNullable::create(std::move(read_column), std::move(null_bytemap));
}
else
{
column.column = std::move(read_column);
}
res.insert(std::move(column));
}
return res;
}
}

View File

@ -1,43 +0,0 @@
#pragma once
#include <Columns/IColumn.h>
#include <Columns/ColumnVector.h>
#include <DataStreams/IProfilingBlockInputStream.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeDate.h>
// TODO: refine includes
#include <arrow/api.h>
namespace DB
{
/// Block input stream for the "Parquet" format.
/// It is constructed from a ReadBuffer to read from and a header Block; the stream keeps a
/// reference to the buffer and its own copy of the header.
class ParquetBlockInputStream : public IProfilingBlockInputStream
{
public:
ParquetBlockInputStream(ReadBuffer & istr_, const Block & header_);
String getName() const override { return "Parquet"; }
Block getHeader() const override;
protected:
Block readImpl() override;
private:
/// Source of the raw data; held by reference, so it must outlive the stream.
ReadBuffer & istr;
Block header;
/// Per-type helpers that copy the contents of an Arrow column into an already created internal column.
static void fillColumnWithStringData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column);
static void fillColumnWithBooleanData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column);
static void fillColumnWithDate32Data(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column);
template <typename NumericType>
static void fillColumnWithNumericData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column);
/// Fills `bytemap` from the null information of an Arrow column.
static void fillByteMapFromArrowColumn(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & bytemap);
/// Arrow type id -> internal data type used for it.
static const std::unordered_map<arrow::Type::type, std::shared_ptr<IDataType>> arrow_type_to_internal_type;
// TODO: check that this class implements every part of its parent
};
}

View File

@ -1,271 +0,0 @@
// TODO: clean includes
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsNumber.h>
#include <Core/ColumnWithTypeAndName.h>
#include <DataTypes/DataTypeNullable.h>
#include <IO/WriteHelpers.h>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/writer.h>
#include <parquet/util/memory.h>
#include <parquet/exception.h>
#include <DataStreams/ParquetBlockOutputStream.h>
namespace DB
{
/// Keeps a reference to the output buffer and a copy of the header block.
ParquetBlockOutputStream::ParquetBlockOutputStream(WriteBuffer & ostr_, const Block & header_) : ostr(ostr_), header(header_) {}
/// Flushing is delegated to the underlying buffer by calling its next().
void ParquetBlockOutputStream::flush() { ostr.next(); }
/// Does nothing when `append_status` is OK; otherwise throws an Exception whose message contains
/// `column_name` and the textual form of the Arrow status.
void checkAppendStatus(arrow::Status & append_status, const std::string & column_name)
{
    if (append_status.ok())
        return;

    const std::string status_text = append_status.ToString();
    throw Exception("Error while building a parquet column \"" + column_name + "\": " + status_text);
}
/// Does nothing when `finish_status` is OK; otherwise throws an Exception whose message contains
/// `column_name` and the textual form of the Arrow status.
void checkFinishStatus(arrow::Status & finish_status, const std::string & column_name)
{
    if (finish_status.ok())
        return;

    const std::string status_text = finish_status.ToString();
    throw Exception("Error while writing a parquet column \"" + column_name + "\": " + status_text);
}
/// Builds an Arrow array from a numeric ClickHouse column by handing the whole value buffer
/// to the Arrow builder in a single AppendValues() call.
///
/// write_column - column whose data is ColumnVector<NumericType>.
/// arrow_array  - receives the finished Arrow array.
/// null_bytemap - optional ClickHouse null map (non-zero means NULL); may be nullptr.
///
/// NOTE(review): error messages use write_column->getName(), i.e. the name reported by the column
/// object itself - confirm this is the name intended to be shown to the user.
template <typename NumericType, typename ArrowBuilderType>
void ParquetBlockOutputStream::fillArrowArrayWithNumericColumnData(
    ColumnPtr write_column,
    std::shared_ptr<arrow::Array> & arrow_array,
    const PaddedPODArray<UInt8> * null_bytemap)
{
    const auto & values = static_cast<const ColumnVector<NumericType> &>(*write_column).getData();

    /// Arrow interprets 1 as "value is present" while ClickHouse uses 1 for NULL,
    /// so the lowest bit of every flag is flipped.
    PaddedPODArray<UInt8> validity_bytes;
    const UInt8 * validity_ptr = nullptr;
    if (null_bytemap)
    {
        const size_t rows = null_bytemap->size();
        validity_bytes.reserve(rows);
        for (size_t row = 0; row != rows; ++row)
            validity_bytes.emplace_back(1 ^ (*null_bytemap)[row]);
        validity_ptr = validity_bytes.data();
    }

    ArrowBuilderType builder;
    arrow::Status append_status = builder.AppendValues(values.data(), values.size(), validity_ptr);
    checkAppendStatus(append_status, write_column->getName());

    arrow::Status finish_status = builder.Finish(&arrow_array);
    checkFinishStatus(finish_status, write_column->getName());
}
/// Builds an Arrow string array from a ClickHouse ColumnString, appending row by row.
/// Rows flagged in `null_bytemap` (when it is given) are appended as NULL, all other rows are
/// appended with the data and size returned by getDataAt().
/// Every append and the final Finish() are checked; a failure raises an Exception.
void ParquetBlockOutputStream::fillArrowArrayWithStringColumnData(
    ColumnPtr write_column,
    std::shared_ptr<arrow::Array> & arrow_array,
    const PaddedPODArray<UInt8> * null_bytemap)
{
    const auto & strings = static_cast<const ColumnString &>(*write_column);
    arrow::StringBuilder builder;

    for (size_t row = 0; row != strings.size(); ++row)
    {
        const bool is_null = null_bytemap && (*null_bytemap)[row];

        arrow::Status append_status;
        if (is_null)
        {
            append_status = builder.AppendNull();
        }
        else
        {
            const StringRef value = strings.getDataAt(row);
            append_status = builder.Append(value.data, value.size);
        }
        checkAppendStatus(append_status, write_column->getName());
    }

    arrow::Status finish_status = builder.Finish(&arrow_array);
    checkFinishStatus(finish_status, write_column->getName());
}
/// Builds an Arrow Date32 array from a ClickHouse Date column (UInt16 values), appending row by row.
/// Rows flagged in `null_bytemap` (when it is given) are appended as NULL.
/// Every append and the final Finish() are checked; a failure raises an Exception.
void ParquetBlockOutputStream::fillArrowArrayWithDateColumnData(
    ColumnPtr write_column,
    std::shared_ptr<arrow::Array> & arrow_array,
    const PaddedPODArray<UInt8> * null_bytemap)
{
    const auto & day_numbers = static_cast<const ColumnVector<UInt16> &>(*write_column).getData();
    arrow::Date32Builder builder;

    for (size_t row = 0; row != day_numbers.size(); ++row)
    {
        arrow::Status append_status;
        const bool is_null = null_bytemap && (*null_bytemap)[row];
        if (is_null)
            append_status = builder.AppendNull();
        else
            append_status = builder.Append(day_numbers[row]); /// UInt16 is implicitly converted to Int32
        checkAppendStatus(append_status, write_column->getName());
    }

    arrow::Status finish_status = builder.Finish(&arrow_array);
    checkFinishStatus(finish_status, write_column->getName());
}
/// X-macro listing the internal numeric types together with the Arrow builder class used for them.
/// M is invoked once per pair as M(cpp_type, arrow_builder_type); write() passes a macro that expands
/// each pair into an `else if` branch comparing the stringized cpp_type with the column type name.
#define FOR_INTERNAL_NUMERIC_TYPES(M) \
M(UInt8, arrow::UInt8Builder) \
M(Int8, arrow::Int8Builder) \
M(UInt16, arrow::UInt16Builder) \
M(Int16, arrow::Int16Builder) \
M(UInt32, arrow::UInt32Builder) \
M(Int32, arrow::Int32Builder) \
M(UInt64, arrow::UInt64Builder) \
M(Int64, arrow::Int64Builder) \
M(Float32, arrow::FloatBuilder) \
M(Float64, arrow::DoubleBuilder)
/// Maps the name of an internal (non-Nullable) data type to the Arrow data type written for it.
/// write() rejects a column whose nested type name is missing from this map.
const std::unordered_map<String, std::shared_ptr<arrow::DataType>> ParquetBlockOutputStream::internal_type_to_arrow_type = {
{"UInt8", arrow::uint8()},
{"Int8", arrow::int8()},
{"UInt16", arrow::uint16()},
{"Int16", arrow::int16()},
{"UInt32", arrow::uint32()},
{"Int32", arrow::int32()},
{"UInt64", arrow::uint64()},
{"Int64", arrow::int64()},
{"Float32", arrow::float32()},
{"Float64", arrow::float64()},
{"Date", arrow::date32()},
// TODO: ClickHouse can actually store non-utf8 strings!
{"String", arrow::utf8()}//,
// TODO: add other types:
// 1. FixedString
// 2. DateTime
};
/// Returns a pointer to the null map data of a Nullable column.
/// `column` is cast to ColumnNullable without a check, so the caller must pass a Nullable column.
/// The returned pointer refers to data reached through the column's null map column; it is not a copy.
const PaddedPODArray<UInt8> * extractNullBytemapPtr(ColumnPtr column)
{
    const auto & nullable_column = static_cast<const ColumnNullable &>(*column);
    ColumnPtr null_map_column = nullable_column.getNullMapColumnPtr();
    const auto & null_map_vector = static_cast<const ColumnVector<UInt8> &>(*null_map_column);
    return &null_map_vector.getData();
}
/// Converts `block` into an Arrow table, serializes that table as Parquet into an in-memory sink
/// and writes the resulting bytes to `ostr`.
///
/// Throws Exception when a column's (nested) type is not supported or when Arrow reports a failure
/// while building an array or writing the table.
///
/// NOTE(review): every call serializes a complete table with WriteTable(); if write() is called for
/// several blocks, several independent Parquet payloads are appended to `ostr` - confirm this is intended.
void ParquetBlockOutputStream::write(const Block & block)
{
block.checkNumberOfRows();
const size_t columns_num = block.columns();
/// For arrow::Schema and arrow::Table creation
std::vector<std::shared_ptr<arrow::Field>> arrow_fields;
std::vector<std::shared_ptr<arrow::Array>> arrow_arrays;
arrow_fields.reserve(columns_num);
arrow_arrays.reserve(columns_num);
for (size_t column_i = 0; column_i < columns_num; ++column_i)
{
// TODO: constructed every iteration
const ColumnWithTypeAndName & column = block.safeGetByPosition(column_i);
const bool is_column_nullable = column.type->isNullable();
/// Type with one level of Nullable removed; its name selects both the Arrow type and the fill routine.
const DataTypePtr column_nested_type =
is_column_nullable
? static_cast<const DataTypeNullable *>(column.type.get())->getNestedType()
: column.type;
/// `column_type` is not used anywhere else in this function.
const DataTypePtr column_type = column.type;
// TODO: do not mix std::string and String
const std::string column_nested_type_name = column_nested_type->getName();
if (internal_type_to_arrow_type.find(column_nested_type_name) == internal_type_to_arrow_type.end())
{
throw Exception(
"The type \"" + column_nested_type_name + "\" of a column \"" + column.name + "\""
" is not supported for conversion into a Parquet data format"
/*, ErrorCodes::TODO*/
);
}
/// The Arrow field is marked nullable exactly when the internal column type is Nullable.
arrow_fields.emplace_back(new arrow::Field(
column.name,
internal_type_to_arrow_type.at(column_nested_type_name),
is_column_nullable
));
std::shared_ptr<arrow::Array> arrow_array;
/// For a Nullable column the values and the null map are passed separately to the fill routines.
ColumnPtr nested_column = is_column_nullable ? static_cast<const ColumnNullable &>(*column.column).getNestedColumnPtr() : column.column;
const PaddedPODArray<UInt8> * null_bytemap = is_column_nullable ? extractNullBytemapPtr(column.column) : nullptr;
// TODO: use typeid_cast
if ("String" == column_nested_type_name)
{
fillArrowArrayWithStringColumnData(nested_column, arrow_array, null_bytemap);
}
else if ("Date" == column_nested_type_name)
{
fillArrowArrayWithDateColumnData(nested_column, arrow_array, null_bytemap);
}
/// Generates one `else if` branch per pair listed in FOR_INTERNAL_NUMERIC_TYPES.
#define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
else if (#CPP_NUMERIC_TYPE == column_nested_type_name) \
{ \
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(nested_column, arrow_array, null_bytemap); \
}
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
// TODO: there are also internal types that are convertable to parquet/arrow once:
// 1. FixedString(N)
// 2. DateTime
else
{
throw Exception(
"Internal type \"" + column_nested_type_name + "\" of a column \"" + column.name + "\""
" is not supported for conversion into a Parquet data format"/*, ErrorCodes::TODO*/
);
}
arrow_arrays.emplace_back(std::move(arrow_array));
}
std::shared_ptr<arrow::Schema> arrow_schema = std::make_shared<arrow::Schema>(std::move(arrow_fields));
std::shared_ptr<arrow::Table> arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays);
// TODO: get rid of extra copying
std::shared_ptr<parquet::InMemoryOutputStream> sink = std::make_shared<parquet::InMemoryOutputStream>();
/// The row group size is set to the number of rows of the table.
/// NOTE(review): for a block with zero rows this passes 0 as the row group size - verify Arrow accepts it.
// TODO: calculate row_group_size depending on a number of rows and table size
arrow::Status write_status = parquet::arrow::WriteTable(
*arrow_table, arrow::default_memory_pool(), sink,
/* row_group_size = */arrow_table->num_rows(), parquet::default_writer_properties(),
parquet::arrow::default_arrow_writer_properties()
);
if (!write_status.ok())
throw Exception("Error while writing a table: " + write_status.ToString()/*, ErrorCodes::TODO*/);
/// Copy the serialized bytes from the in-memory sink into the output buffer.
std::shared_ptr<arrow::Buffer> table_buffer = sink->GetBuffer();
writeString(reinterpret_cast<const char *>(table_buffer->data()), table_buffer->size(), ostr);
}
};

View File

@ -1,36 +0,0 @@
#pragma once
#include <DataStreams/IBlockOutputStream.h>
#include <DataTypes/DataTypesNumber.h>
namespace DB
{
/// Block output stream that writes blocks to a WriteBuffer with the help of Arrow arrays.
/// It is constructed from the destination buffer and a header Block; the stream keeps a
/// reference to the buffer and its own copy of the header.
class ParquetBlockOutputStream : public IBlockOutputStream
{
public:
ParquetBlockOutputStream(WriteBuffer & ostr_, const Block & header_);
Block getHeader() const override { return header; }
void write(const Block & block) override;
void flush() override;
/// The output is reported as generic binary data.
String getContentType() const override { return "application/octet-stream"; }
private:
/// Destination of the output; held by reference, so it must outlive the stream.
WriteBuffer & ostr;
Block header;
/// Helpers that build an Arrow array from a column. `null_bytemap` is passed as a pointer;
/// NOTE(review): presumably nullptr means "column has no NULLs" - confirm in the implementation.
static void fillArrowArrayWithDateColumnData(ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array,
const PaddedPODArray<UInt8> * null_bytemap);
static void fillArrowArrayWithStringColumnData(ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array,
const PaddedPODArray<UInt8> * null_bytemap);
template <typename NumericType, typename ArrowBuilderType>
static void fillArrowArrayWithNumericColumnData(ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array,
const PaddedPODArray<UInt8> * null_bytemap);
/// Internal type name -> Arrow data type.
static const std::unordered_map<String, std::shared_ptr<arrow::DataType>> internal_type_to_arrow_type;
};
}

View File

@ -80,9 +80,9 @@ public:
scale(scale_)
{
if (unlikely(precision < 1 || precision > maxPrecision()))
throw Exception("Precision is out of bounds", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
throw Exception("Precision " + std::to_string(precision) + " is out of bounds", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
if (unlikely(scale < 0 || static_cast<UInt32>(scale) > maxPrecision()))
throw Exception("Scale is out of bounds", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
throw Exception("Scale " + std::to_string(scale) + " is out of bounds", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
}
const char * getFamilyName() const override { return "Decimal"; }

View File

@ -69,6 +69,7 @@ BlockOutputStreamPtr FormatFactory::getOutput(const String & name, WriteBuffer &
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
format_settings.pretty.color = settings.output_format_pretty_color;
format_settings.write_statistics = settings.output_format_write_statistics;
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
/** Materialization is needed, because formats can use the functions `IDataType`,
* which only work with full columns.
@ -111,6 +112,8 @@ void registerInputFormatTSKV(FormatFactory & factory);
void registerOutputFormatTSKV(FormatFactory & factory);
void registerInputFormatJSONEachRow(FormatFactory & factory);
void registerOutputFormatJSONEachRow(FormatFactory & factory);
void registerInputFormatParquet(FormatFactory & factory);
void registerOutputFormatParquet(FormatFactory & factory);
void registerOutputFormatProtobuf(FormatFactory & factory);
/// Output only (presentational) formats.
@ -149,6 +152,8 @@ FormatFactory::FormatFactory()
registerOutputFormatJSONEachRow(*this);
registerOutputFormatProtobuf(*this);
registerInputFormatCapnProto(*this);
registerInputFormatParquet(*this);
registerOutputFormatParquet(*this);
registerOutputFormatPretty(*this);
registerOutputFormatPrettyCompact(*this);

View File

@ -61,6 +61,12 @@ struct FormatSettings
UInt64 input_allow_errors_num = 0;
Float32 input_allow_errors_ratio = 0;
/// Settings specific to the Parquet format.
struct Parquet
{
/// Defaults to 1000000; FormatFactory::getOutput() overwrites it with
/// settings.output_format_parquet_row_group_size.
UInt64 row_group_size = 1000000;
} parquet;
};
}

View File

@ -4,10 +4,6 @@
#include <IO/WriteBuffer.h>
#include <IO/WriteHelpers.h>
#include <Core/iostream_debug_helpers.h>
namespace DB
{
ODBCDriver2BlockOutputStream::ODBCDriver2BlockOutputStream(

View File

@ -0,0 +1,497 @@
#include <Common/config.h>
#if USE_PARQUET
# include "ParquetBlockInputStream.h"
# include <algorithm>
# include <iterator>
# include <vector>
// TODO: clear includes
# include <Columns/ColumnNullable.h>
# include <Columns/ColumnString.h>
# include <Columns/ColumnsNumber.h>
# include <Columns/IColumn.h>
# include <Core/ColumnWithTypeAndName.h>
# include <DataTypes/DataTypeDate.h>
# include <DataTypes/DataTypeDateTime.h>
# include <DataTypes/DataTypeFactory.h>
# include <DataTypes/DataTypeNullable.h>
# include <DataTypes/DataTypeString.h>
# include <DataTypes/DataTypesDecimal.h>
# include <DataTypes/DataTypesNumber.h>
# include <Formats/FormatFactory.h>
# include <IO/BufferBase.h>
# include <IO/ReadBufferFromMemory.h>
# include <IO/WriteBufferFromString.h>
# include <IO/WriteHelpers.h>
# include <IO/copyData.h>
# include <Interpreters/castColumn.h>
# include <common/DateLUTImpl.h>
# include <ext/range.h>
# include <arrow/api.h>
//# include <arrow/buffer.h>
//# include <arrow/io/api.h>
# include <parquet/arrow/reader.h>
//# include <parquet/arrow/writer.h>
//# include <parquet/exception.h>
# include <parquet/file_reader.h>
# include <Core/iostream_debug_helpers.h> // REMOVE ME
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_TYPE;
extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE;
extern const int CANNOT_READ_ALL_DATA;
extern const int EMPTY_DATA_PASSED;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int CANNOT_CONVERT_TYPE;
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN;
extern const int THERE_IS_NO_COLUMN;
}
/// Stores the input buffer, the header block and the context in the corresponding members.
ParquetBlockInputStream::ParquetBlockInputStream(ReadBuffer & istr_, const Block & header_, const Context & context_)
    : istr{istr_}
    , header{header_}
    , context{context_}
{
}
/// Returns a copy of the header block given at construction.
Block ParquetBlockInputStream::getHeader() const { return header; }
/// Inserts numeric data right into internal column data to reduce an overhead.
/// The values of every chunk are copied in bulk from the chunk's raw value buffer.
///
/// arrow_column    - source column; its values must have the in-memory layout of NumericType.
/// internal_column - destination column, cast to VectorType without a check.
///
/// The raw buffer of an Arrow array can be shared with other arrays, in which case the array's own
/// values start `offset()` elements into the buffer; the offset is therefore added before copying.
/// Empty chunks are skipped so that their value buffer is never dereferenced.
template <typename NumericType, typename VectorType = ColumnVector<NumericType>>
void fillColumnWithNumericData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & column_data = static_cast<VectorType &>(*internal_column).getData();
    column_data.reserve(arrow_column->length());

    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        std::shared_ptr<arrow::Array> chunk = arrow_column->data()->chunk(chunk_i);
        if (chunk->length() == 0)
            continue;

        /// buffers[0] is a null bitmap and buffers[1] are actual values
        std::shared_ptr<arrow::Buffer> buffer = chunk->data()->buffers[1];
        const auto * raw_data = reinterpret_cast<const NumericType *>(buffer->data()) + chunk->offset();
        column_data.insert_assume_reserved(raw_data, raw_data + chunk->length());
    }
}
/// Inserts chars and offsets right into internal column data to reduce an overhead.
/// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars.
/// Also internal strings are null terminated.
///
/// The first pass computes an upper bound for the number of chars to reserve, the second pass copies
/// the values. A NULL value (or a chunk without a value buffer) is stored as an empty string.
/// Empty chunks are skipped in the first pass: they have no last element, so `chunk_length - 1`
/// would wrap around and index far outside the offsets of the chunk.
void fillColumnWithStringData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    PaddedPODArray<UInt8> & column_chars_t = static_cast<ColumnString &>(*internal_column).getChars();
    PaddedPODArray<UInt64> & column_offsets = static_cast<ColumnString &>(*internal_column).getOffsets();

    size_t chars_t_size = 0;
    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::BinaryArray & chunk = static_cast<arrow::BinaryArray &>(*(arrow_column->data()->chunk(chunk_i)));
        const size_t chunk_length = chunk.length();
        if (chunk_length == 0)
            continue;

        /// End position of the last value is an upper bound for the bytes this chunk can contribute.
        chars_t_size += chunk.value_offset(chunk_length - 1) + chunk.value_length(chunk_length - 1);
        chars_t_size += chunk_length; /// additional space for null bytes
    }

    column_chars_t.reserve(chars_t_size);
    column_offsets.reserve(arrow_column->length());

    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::BinaryArray & chunk = static_cast<arrow::BinaryArray &>(*(arrow_column->data()->chunk(chunk_i)));
        std::shared_ptr<arrow::Buffer> buffer = chunk.value_data();
        const size_t chunk_length = chunk.length();

        for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i)
        {
            if (!chunk.IsNull(offset_i) && buffer)
            {
                const UInt8 * raw_data = buffer->data() + chunk.value_offset(offset_i);
                column_chars_t.insert_assume_reserved(raw_data, raw_data + chunk.value_length(offset_i));
            }
            column_chars_t.emplace_back('\0');
            column_offsets.emplace_back(column_chars_t.size());
        }
    }
}
/// Copies the values of an Arrow boolean column into a ClickHouse UInt8 column, one byte per value.
/// Values are read one by one through arrow::BooleanArray::Value().
/// An Arrow column may consist of several chunks, so `row_offset` remembers where the current chunk
/// starts in the destination; without it every chunk would be written from index 0 and overwrite
/// the previous one.
void fillColumnWithBooleanData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & column_data = static_cast<ColumnVector<UInt8> &>(*internal_column).getData();
    column_data.resize(arrow_column->length());

    size_t row_offset = 0;
    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::BooleanArray & chunk = static_cast<arrow::BooleanArray &>(*(arrow_column->data()->chunk(chunk_i)));
        const size_t chunk_length = static_cast<size_t>(chunk.length());

        for (size_t bool_i = 0; bool_i != chunk_length; ++bool_i)
            column_data[row_offset + bool_i] = chunk.Value(bool_i);

        /// The next chunk continues right after the rows written for this one.
        row_offset += chunk_length;
    }
}
/// Converts an Arrow DATE32 column (Int32 values) into a ClickHouse Date column (UInt16 values).
/// Every value is cast to UInt32 and compared with DATE_LUT_MAX_DAY_NUM before being stored; a value
/// above that limit raises an Exception with the code VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE.
void fillColumnWithDate32Data(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & dates = static_cast<ColumnVector<UInt16> &>(*internal_column).getData();
    dates.reserve(arrow_column->length());

    const size_t chunks_count = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_idx = 0; chunk_idx != chunks_count; ++chunk_idx)
    {
        auto & date_chunk = static_cast<arrow::Date32Array &>(*(arrow_column->data()->chunk(chunk_idx)));
        const size_t rows = static_cast<size_t>(date_chunk.length());

        for (size_t row = 0; row != rows; ++row)
        {
            const UInt32 day = static_cast<UInt32>(date_chunk.Value(row));
            if (day <= DATE_LUT_MAX_DAY_NUM)
            {
                dates.emplace_back(day);
                continue;
            }

            // TODO: will it rollback correctly?
            throw Exception{"Input value " + std::to_string(day) + " of a column \"" + arrow_column->name()
                    + "\" is greater than "
                      "max allowed Date value, which is "
                    + std::to_string(DATE_LUT_MAX_DAY_NUM),
                ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE};
        }
    }
}
/// Converts an Arrow DATE64 column (Int64 values) into a ClickHouse DateTime column (UInt32 values).
/// Each value is divided by 1000 (the source is assumed to be in milliseconds) and cast to UInt32.
/// NOTE(review): unlike the Date32 conversion, no range check is done here - a quotient that does
/// not fit into UInt32 is silently narrowed by the cast.
void fillColumnWithDate64Data(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & seconds_data = static_cast<ColumnVector<UInt32> &>(*internal_column).getData();
    seconds_data.reserve(arrow_column->length());

    const size_t chunks_count = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_idx = 0; chunk_idx != chunks_count; ++chunk_idx)
    {
        auto & date_chunk = static_cast<arrow::Date64Array &>(*(arrow_column->data()->chunk(chunk_idx)));
        const size_t rows = static_cast<size_t>(date_chunk.length());

        for (size_t row = 0; row != rows; ++row)
            seconds_data.emplace_back(static_cast<UInt32>(date_chunk.Value(row) / 1000)); // Always? in ms
    }
}
/// Converts an Arrow TIMESTAMP column into a ClickHouse DateTime column (UInt32 values).
/// For every chunk the time unit of its type selects a divisor (1, 1000, 1000000 or 1000000000 for
/// SECOND, MILLI, MICRO and NANO); each value is divided by it and cast to UInt32.
/// NOTE(review): no range check is done - a quotient that does not fit into UInt32 is silently
/// narrowed by the cast.
void fillColumnWithTimestampData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & timestamps = static_cast<ColumnVector<UInt32> &>(*internal_column).getData();
    timestamps.reserve(arrow_column->length());

    const size_t chunks_count = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_idx = 0; chunk_idx != chunks_count; ++chunk_idx)
    {
        auto & ts_chunk = static_cast<arrow::TimestampArray &>(*(arrow_column->data()->chunk(chunk_idx)));
        const auto & ts_type = static_cast<const ::arrow::TimestampType &>(*ts_chunk.type());

        /// Number of source units in one second; stays 1 for any unit not listed below.
        UInt32 units_per_second = 1;
        const auto unit = ts_type.unit();
        if (unit == arrow::TimeUnit::MILLI)
            units_per_second = 1000;
        else if (unit == arrow::TimeUnit::MICRO)
            units_per_second = 1000000;
        else if (unit == arrow::TimeUnit::NANO)
            units_per_second = 1000000000;

        const size_t rows = static_cast<size_t>(ts_chunk.length());
        for (size_t row = 0; row != rows; ++row)
            timestamps.emplace_back(static_cast<UInt32>(ts_chunk.Value(row) / units_per_second));
    }
}
/// Copies an Arrow decimal column into a ColumnDecimal<Decimal128>, value by value.
/// A NULL value is stored as Decimal128(0); any other value is read by reinterpreting the bytes
/// returned by DecimalArray::Value() as a Decimal128.
/// NOTE(review): this presumably relies on Arrow's decimal bytes having the same layout as
/// Decimal128 - confirm.
void fillColumnWithDecimalData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & decimals = static_cast<ColumnDecimal<Decimal128> &>(*internal_column).getData();
    decimals.reserve(arrow_column->length());

    const size_t chunks_count = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_idx = 0; chunk_idx != chunks_count; ++chunk_idx)
    {
        auto & decimal_chunk = static_cast<arrow::DecimalArray &>(*(arrow_column->data()->chunk(chunk_idx)));
        const size_t rows = static_cast<size_t>(decimal_chunk.length());

        for (size_t row = 0; row != rows; ++row)
        {
            // TODO: copy column
            if (decimal_chunk.IsNull(row))
                decimals.emplace_back(Decimal128(0));
            else
                decimals.emplace_back(Decimal128(*reinterpret_cast<const Decimal128 *>(decimal_chunk.Value(row))));
        }
    }
}
/// Builds a null bytemap from an Arrow column: for every row of every chunk one byte is appended,
/// holding the result of arrow::Array::IsNull() for that row.
void fillByteMapFromArrowColumn(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & bytemap)
{
    auto & null_flags = static_cast<ColumnVector<UInt8> &>(*bytemap).getData();
    null_flags.reserve(arrow_column->length());

    const size_t chunks_count = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
    {
        const std::shared_ptr<arrow::Array> current_chunk = arrow_column->data()->chunk(chunk_idx);
        const size_t rows = static_cast<size_t>(current_chunk->length());

        for (size_t row = 0; row < rows; ++row)
            null_flags.emplace_back(current_chunk->IsNull(row));
    }
}
/// X-macro listing the Arrow numeric type ids together with the C++ numeric type used for them.
/// M is invoked once per pair as M(arrow_type_id, cpp_type).
/// HALF_FLOAT is deliberately not part of the list yet (see the commented-out entry below).
# define FOR_ARROW_NUMERIC_TYPES(M) \
M(arrow::Type::UINT8, UInt8) \
M(arrow::Type::INT8, Int8) \
M(arrow::Type::UINT16, UInt16) \
M(arrow::Type::INT16, Int16) \
M(arrow::Type::UINT32, UInt32) \
M(arrow::Type::INT32, Int32) \
M(arrow::Type::UINT64, UInt64) \
M(arrow::Type::INT64, Int64) \
M(arrow::Type::FLOAT, Float32) \
M(arrow::Type::DOUBLE, Float64)
//M(arrow::Type::HALF_FLOAT, Float32) // TODO
/// Lookup from a column name to the Arrow column carrying that name.
using NameToColumnPtr = std::unordered_map<std::string, std::shared_ptr<arrow::Column>>;
/// Maps an Arrow type id to the ClickHouse data type it is converted to.
/// Commented-out entries are candidates that are not enabled.
/// NOTE(review): HALF_FLOAT is mapped to Float32 here but is not listed in FOR_ARROW_NUMERIC_TYPES -
/// verify that the reading code handles or rejects it.
const std::unordered_map<arrow::Type::type, std::shared_ptr<IDataType>> arrow_type_to_internal_type = {
//{arrow::Type::DECIMAL, std::make_shared<DataTypeDecimal>()},
{arrow::Type::UINT8, std::make_shared<DataTypeUInt8>()},
{arrow::Type::INT8, std::make_shared<DataTypeInt8>()},
{arrow::Type::UINT16, std::make_shared<DataTypeUInt16>()},
{arrow::Type::INT16, std::make_shared<DataTypeInt16>()},
{arrow::Type::UINT32, std::make_shared<DataTypeUInt32>()},
{arrow::Type::INT32, std::make_shared<DataTypeInt32>()},
{arrow::Type::UINT64, std::make_shared<DataTypeUInt64>()},
{arrow::Type::INT64, std::make_shared<DataTypeInt64>()},
{arrow::Type::HALF_FLOAT, std::make_shared<DataTypeFloat32>()},
{arrow::Type::FLOAT, std::make_shared<DataTypeFloat32>()},
{arrow::Type::DOUBLE, std::make_shared<DataTypeFloat64>()},
{arrow::Type::BOOL, std::make_shared<DataTypeUInt8>()},
//{arrow::Type::DATE32, std::make_shared<DataTypeDate>()},
{arrow::Type::DATE32, std::make_shared<DataTypeDate>()},
//{arrow::Type::DATE32, std::make_shared<DataTypeDateTime>()},
{arrow::Type::DATE64, std::make_shared<DataTypeDateTime>()},
{arrow::Type::TIMESTAMP, std::make_shared<DataTypeDateTime>()},
//{arrow::Type::TIME32, std::make_shared<DataTypeDateTime>()},
/// Both STRING and BINARY are read as String.
{arrow::Type::STRING, std::make_shared<DataTypeString>()},
{arrow::Type::BINARY, std::make_shared<DataTypeString>()},
//{arrow::Type::FIXED_SIZE_BINARY, std::make_shared<DataTypeString>()},
//{arrow::Type::UUID, std::make_shared<DataTypeString>()},
// TODO: add other types that are convertable to internal ones:
// 0. ENUM?
// 1. UUID -> String
// 2. JSON -> String
// Full list of types: contrib/arrow/cpp/src/arrow/type.h
};
/// Returns the next row group of the Parquet input as a Block, or an empty Block when
/// all row groups have been returned.
///
/// On the first call (while the input buffer is not at EOF) the whole input is copied into
/// `file_data` and a Parquet file reader is opened on top of it. Every call then reads one
/// row group, converts each column requested by `header` into an internal column and casts
/// it to the header's type.
///
/// Throws:
///   CANNOT_READ_ALL_DATA                  - new input arrived before all row groups were consumed,
///                                           or Arrow failed to read the row group;
///   EMPTY_DATA_PASSED                     - the row group has no rows;
///   SIZES_OF_COLUMNS_DOESNT_MATCH         - the file has fewer columns than the header;
///   THERE_IS_NO_COLUMN                    - a header column is missing from the file;
///   CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN - NULLs found for a non-nullable header column;
///   CANNOT_CONVERT_TYPE / UNKNOWN_TYPE    - the Arrow type has no supported conversion.
Block ParquetBlockInputStream::readImpl()
{
    Block res;

    if (!istr.eof())
    {
        /*
           First we load the whole stream into a string (this is very bad and limits the .parquet file size to about half of RAM).
           Then we produce a block for every row group (do not load big .parquet files that consist of a single row group -
           it can take 10+ times more RAM than the .parquet file size).
        */
        if (row_group_current < row_group_total)
            throw Exception{"Got new data, but data from previous chunks not readed " + std::to_string(row_group_current) + "/" + std::to_string(row_group_total), ErrorCodes::CANNOT_READ_ALL_DATA};

        file_data.clear();
        {
            /// The scope makes sure the write buffer is finalized before file_data is used.
            WriteBufferFromString file_buffer(file_data);
            copyData(istr, file_buffer);
        }

        buffer = std::make_unique<arrow::Buffer>(file_data);
        // TODO: maybe use parquet::RandomAccessSource?
        auto reader = parquet::ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(*buffer));
        file_reader = std::make_unique<parquet::arrow::FileReader>(::arrow::default_memory_pool(), std::move(reader));
        row_group_total = file_reader->num_row_groups();
        row_group_current = 0;
    }

    if (row_group_current >= row_group_total)
        return res;

    // TODO: also catch a ParquetException thrown by filereader?
    std::shared_ptr<arrow::Table> table;
    arrow::Status read_status = file_reader->ReadRowGroup(row_group_current, &table);
    if (!read_status.ok())
        throw Exception{"Error while reading parquet data: " + read_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA};

    if (0 == table->num_rows())
        throw Exception{"Empty table in input data", ErrorCodes::EMPTY_DATA_PASSED};

    if (header.columns() > static_cast<size_t>(table->num_columns()))
        // TODO: What if some columns were not presented? Insert NULLs? What if a column is not nullable?
        throw Exception{"Number of columns is less than the table has", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH};

    ++row_group_current;

    /// Columns are matched by name, not by position.
    NameToColumnPtr name_to_column_ptr;
    for (size_t i = 0, num_columns = static_cast<size_t>(table->num_columns()); i < num_columns; ++i)
    {
        std::shared_ptr<arrow::Column> arrow_column = table->column(i);
        name_to_column_ptr[arrow_column->name()] = arrow_column;
    }

    for (size_t column_i = 0, columns = header.columns(); column_i < columns; ++column_i)
    {
        const ColumnWithTypeAndName & header_column = header.getByPosition(column_i);

        const auto found_column = name_to_column_ptr.find(header_column.name);
        if (found_column == name_to_column_ptr.end())
            // TODO: What if some columns were not presented? Insert NULLs? What if a column is not nullable?
            throw Exception{"Column \"" + header_column.name + "\" is not presented in input data", ErrorCodes::THERE_IS_NO_COLUMN};

        /// Non-const copy: the fill* helpers take the shared_ptr by non-const reference.
        std::shared_ptr<arrow::Column> arrow_column = found_column->second;
        const arrow::Type::type arrow_type = arrow_column->type()->id();

        // TODO: check if a column is const?
        /// After this check a column with NULLs always targets a nullable header column,
        /// so the nullability of the intermediate column is exactly the header's nullability.
        const bool target_column_is_nullable = header_column.type->isNullable();
        if (!target_column_is_nullable && arrow_column->null_count())
        {
            throw Exception{"Can not insert NULL data into non-nullable column \"" + header_column.name + "\"",
                            ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN};
        }

        DataTypePtr internal_nested_type;
        if (arrow_type == arrow::Type::DECIMAL)
        {
            /// Decimal needs precision and scale, so it cannot live in the static type table.
            const auto decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
            internal_nested_type = std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(), decimal_type->scale());
        }
        else
        {
            const auto found_type = arrow_type_to_internal_type.find(arrow_type);
            if (found_type == arrow_type_to_internal_type.end())
            {
                throw Exception{"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + arrow_column->name()
                                    + "\" is not supported for conversion from a Parquet data format",
                                ErrorCodes::CANNOT_CONVERT_TYPE};
            }
            internal_nested_type = found_type->second;
        }

        ColumnWithTypeAndName column;
        column.name = header_column.name;
        column.type = target_column_is_nullable ? makeNullable(internal_nested_type) : internal_nested_type;

        /// Data
        MutableColumnPtr read_column = internal_nested_type->createColumn();

        switch (arrow_type)
        {
            case arrow::Type::STRING:
            case arrow::Type::BINARY:
                //case arrow::Type::FIXED_SIZE_BINARY:
                fillColumnWithStringData(arrow_column, read_column);
                break;
            case arrow::Type::BOOL:
                fillColumnWithBooleanData(arrow_column, read_column);
                break;
            case arrow::Type::DATE32:
                fillColumnWithDate32Data(arrow_column, read_column);
                break;
            case arrow::Type::DATE64:
                fillColumnWithDate64Data(arrow_column, read_column);
                break;
            case arrow::Type::TIMESTAMP:
                fillColumnWithTimestampData(arrow_column, read_column);
                break;
            case arrow::Type::DECIMAL:
                /// fillColumnWithNumericData<Decimal128, ColumnDecimal<Decimal128>> would be faster,
                /// but it has problems with trash values under NULL.
                fillColumnWithDecimalData(arrow_column, read_column /*, internal_nested_type*/);
                break;
# define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
            case ARROW_NUMERIC_TYPE: \
                fillColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, read_column); \
                break;

                FOR_ARROW_NUMERIC_TYPES(DISPATCH)
# undef DISPATCH
            // TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds?
            // TODO: read JSON as a string?
            // TODO: read UUID as a string?
            default:
                throw Exception{"Unsupported parquet type \"" + arrow_column->type()->name() + "\" of an input column \""
                                    + arrow_column->name() + "\"",
                                ErrorCodes::UNKNOWN_TYPE};
        }

        if (column.type->isNullable())
        {
            MutableColumnPtr null_bytemap = DataTypeUInt8().createColumn();
            fillByteMapFromArrowColumn(arrow_column, null_bytemap);
            column.column = ColumnNullable::create(std::move(read_column), std::move(null_bytemap));
        }
        else
        {
            column.column = std::move(read_column);
        }

        /// Convert from the intermediate type to the type the header asks for.
        column.column = castColumn(column, header_column.type, context);
        column.type = header_column.type;
        res.insert(std::move(column));
    }

    return res;
}
/// Registers the "Parquet" input format: the creator builds a ParquetBlockInputStream
/// and ignores the block size and the format settings.
void registerInputFormatParquet(FormatFactory & factory)
{
    auto create_stream = [](ReadBuffer & in,
                            const Block & header,
                            const Context & query_context,
                            size_t /* max_block_size */,
                            const FormatSettings & /* settings */)
    {
        return std::make_shared<ParquetBlockInputStream>(in, header, query_context);
    };

    factory.registerInputFormat("Parquet", create_stream);
}
}
#else
namespace DB
{

class FormatFactory;

/// Built without Parquet support (USE_PARQUET is off): nothing to register.
void registerInputFormatParquet(FormatFactory &) {}

}
#endif

View File

@ -0,0 +1,46 @@
#pragma once
#include <Common/config.h>
#if USE_PARQUET
# include <DataStreams/IBlockInputStream.h>
//# include <parquet/file_reader.h>
//# include <parquet/arrow/reader.h>
//# include <arrow/buffer.h>
namespace parquet { namespace arrow { class FileReader; } }
namespace arrow { class Buffer; }
namespace DB
{
class Context;
/// Block input stream for the "Parquet" format.
/// readImpl() loads the whole input into memory, opens a Parquet reader on it and
/// returns one Block per row group.
/// NOTE(review): file_reader and buffer are unique_ptr to types that are only forward-declared
/// here, so any code that destroys this object needs the full Arrow/Parquet definitions - confirm.
class ParquetBlockInputStream : public IBlockInputStream
{
public:
ParquetBlockInputStream(ReadBuffer & istr_, const Block & header_, const Context & context_);
String getName() const override { return "Parquet"; }
Block getHeader() const override;
protected:
Block readImpl() override;
private:
/// Source of the raw Parquet bytes.
ReadBuffer & istr;
/// Sample block describing the columns (names and types) to produce.
Block header;
// TODO: check that this class implements every part of its parent
/// Passed to castColumn() when converting read columns to the header types.
const Context & context;
/// Reader opened over `buffer`; created in readImpl() when new input is loaded.
std::unique_ptr<parquet::arrow::FileReader> file_reader;
/// The complete input copied from `istr`.
std::string file_data;
/// Arrow buffer constructed from `file_data`.
std::unique_ptr<arrow::Buffer> buffer;
/// Number of row groups in the loaded file and the index of the next one to read.
int row_group_total = 0;
int row_group_current = 0;
};
}
#endif

View File

@ -0,0 +1,453 @@
#include <Common/config.h>
#if USE_PARQUET
# include "ParquetBlockOutputStream.h"
// TODO: clean includes
# include <Columns/ColumnDecimal.h>
# include <Columns/ColumnFixedString.h>
# include <Columns/ColumnNullable.h>
# include <Columns/ColumnString.h>
# include <Columns/ColumnVector.h>
# include <Columns/ColumnsNumber.h>
# include <Core/ColumnWithTypeAndName.h>
# include <Core/callOnTypeIndex.h>
# include <DataTypes/DataTypeDateTime.h>
# include <DataTypes/DataTypeNullable.h>
# include <DataTypes/DataTypesDecimal.h>
# include <DataStreams/SquashingBlockOutputStream.h>
# include <Formats/FormatFactory.h>
# include <IO/WriteHelpers.h>
# include <arrow/api.h>
# include <arrow/io/api.h>
# include <arrow/util/decimal.h>
# include <parquet/arrow/writer.h>
# include <parquet/exception.h>
# include <parquet/util/memory.h>
# include <Core/iostream_debug_helpers.h> // REMOVE ME
namespace DB
{
/// Error codes thrown from this file; they are only declared here (extern).
namespace ErrorCodes
{
extern const int UNKNOWN_EXCEPTION;
extern const int UNKNOWN_TYPE;
}
/// Remembers the output buffer and keeps copies of the sample block and the format settings.
/// The Parquet writer itself is created lazily by the first write().
ParquetBlockOutputStream::ParquetBlockOutputStream(WriteBuffer & ostr_, const Block & header_, const FormatSettings & format_settings_)
    : ostr{ostr_}
    , header{header_}
    , format_settings{format_settings_}
{
}
/// Calls next() on the underlying write buffer
/// (presumably this pushes the bytes buffered so far to the destination - confirm against WriteBuffer).
void ParquetBlockOutputStream::flush()
{
ostr.next();
}
void checkStatus(arrow::Status & status, const std::string & column_name)
{
if (!status.ok())
throw Exception{"Error with a parquet column \"" + column_name + "\": " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
}
/// Builds an Arrow array from a ColumnVector<NumericType> with one bulk AppendValues() call.
/// `null_bytemap` may be nullptr (no NULLs); otherwise it holds 1 for NULL rows and is
/// inverted first, because Arrow expects 1 for valid (non-null) rows.
/// Throws (via checkStatus) if the Arrow builder reports an error.
template <typename NumericType, typename ArrowBuilderType>
void fillArrowArrayWithNumericColumnData(
    ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array, const PaddedPODArray<UInt8> * null_bytemap)
{
    const PaddedPODArray<NumericType> & values = static_cast<const ColumnVector<NumericType> &>(*write_column).getData();

    /// Stays empty (and the pointer null) when the column has no null map.
    PaddedPODArray<UInt8> validity_bytes;
    const UInt8 * validity_ptr = nullptr;
    if (null_bytemap)
    {
        validity_bytes.reserve(null_bytemap->size());
        for (const UInt8 is_null : *null_bytemap)
            validity_bytes.emplace_back(1 ^ is_null);
        validity_ptr = validity_bytes.data();
    }

    ArrowBuilderType builder;

    arrow::Status status = builder.AppendValues(values.data(), values.size(), validity_ptr);
    checkStatus(status, write_column->getName());

    status = builder.Finish(&arrow_array);
    checkStatus(status, write_column->getName());
}
/// Builds an Arrow string array from a string-like column (ColumnType must provide
/// size() and getDataAt()). Rows flagged in `null_bytemap` (which may be nullptr)
/// are appended as NULL; all other rows are appended with their exact bytes.
/// Throws (via checkStatus) if the Arrow builder reports an error.
template <typename ColumnType>
void fillArrowArrayWithStringColumnData(
    ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array, const PaddedPODArray<UInt8> * null_bytemap)
{
    const auto & strings = static_cast<const ColumnType &>(*write_column);
    arrow::StringBuilder builder;

    const size_t rows = strings.size();
    for (size_t row = 0; row < rows; ++row)
    {
        const bool is_null = null_bytemap && (*null_bytemap)[row];

        arrow::Status append_status;
        if (is_null)
        {
            append_status = builder.AppendNull();
        }
        else
        {
            const StringRef value = strings.getDataAt(row);
            append_status = builder.Append(value.data, value.size);
        }
        checkStatus(append_status, write_column->getName());
    }

    arrow::Status finish_status = builder.Finish(&arrow_array);
    checkStatus(finish_status, write_column->getName());
}
/// Builds an Arrow UInt16 array from a Date column: the raw UInt16 values are written as is
/// (no conversion to an Arrow date type). Rows flagged in `null_bytemap` (which may be
/// nullptr) are appended as NULL. Throws (via checkStatus) on an Arrow builder error.
void fillArrowArrayWithDateColumnData(
    ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array, const PaddedPODArray<UInt8> * null_bytemap)
{
    const PaddedPODArray<UInt16> & date_values = static_cast<const ColumnVector<UInt16> &>(*write_column).getData();
    // TODO: arrow::Date32Builder was the original intent here.
    arrow::UInt16Builder builder;

    const size_t rows = date_values.size();
    for (size_t row = 0; row < rows; ++row)
    {
        const bool is_null = null_bytemap && (*null_bytemap)[row];
        arrow::Status append_status = is_null ? builder.AppendNull() : builder.Append(date_values[row]);
        checkStatus(append_status, write_column->getName());
    }

    arrow::Status finish_status = builder.Finish(&arrow_array);
    checkStatus(finish_status, write_column->getName());
}
/// Builds an Arrow UInt32 array from a DateTime column: the raw UInt32 values are written
/// as is. Rows flagged in `null_bytemap` (which may be nullptr) are appended as NULL.
/// Throws (via checkStatus) on an Arrow builder error.
void fillArrowArrayWithDateTimeColumnData(
    ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array, const PaddedPODArray<UInt8> * null_bytemap)
{
    auto & datetime_values = static_cast<const ColumnVector<UInt32> &>(*write_column).getData();
    // TODO: arrow::Date64Builder with values multiplied by 1000 (milliseconds) was tried here; check other units.
    arrow::UInt32Builder builder;

    const size_t rows = datetime_values.size();
    for (size_t row = 0; row < rows; ++row)
    {
        const bool is_null = null_bytemap && (*null_bytemap)[row];
        arrow::Status append_status = is_null ? builder.AppendNull() : builder.Append(datetime_values[row]);
        checkStatus(append_status, write_column->getName());
    }

    arrow::Status finish_status = builder.Finish(&arrow_array);
    checkStatus(finish_status, write_column->getName());
}
/// Builds an Arrow decimal (128-bit) array from a decimal column.
///
/// `DataType` is the internal decimal data type (the caller instantiates it for
/// DataTypeDecimal<Decimal32 / Decimal64 / Decimal128>); `decimal_type` supplies the
/// precision and scale of the Arrow type. Rows flagged in `null_bytemap` (which may be
/// nullptr) are appended as NULL. Throws (via checkStatus) on an Arrow builder error.
template <typename DataType>
void fillArrowArrayWithDecimalColumnData(
    ColumnPtr write_column,
    std::shared_ptr<arrow::Array> & arrow_array,
    const PaddedPODArray<UInt8> * null_bytemap,
    const DataType * decimal_type)
{
    const auto & column = static_cast<const typename DataType::ColumnType &>(*write_column);
    arrow::DecimalBuilder builder(arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()));
    arrow::Status status;

    for (size_t value_i = 0, size = column.size(); value_i < size; ++value_i)
    {
        if (null_bytemap && (*null_bytemap)[value_i])
        {
            status = builder.AppendNull();
        }
        else
        {
            /// arrow::Decimal128(const uint8_t *) reads 16 bytes. The stored value is only
            /// 4 or 8 bytes wide for Decimal32 / Decimal64, so it is sign-extended into a
            /// 128-bit local first instead of pointing Arrow at the narrower element.
            const Int128 wide_value = column.getElement(value_i).value;
            status = builder.Append(arrow::Decimal128(reinterpret_cast<const uint8_t *>(&wide_value)));
        }
        checkStatus(status, write_column->getName());
    }

    status = builder.Finish(&arrow_array);
    checkStatus(status, write_column->getName());

    // TODO: for Decimal128 columns the whole buffer could be appended at once with
    // builder.AppendValues() and an inverted null map, as the numeric helper does.
}
/// X-macro: invokes M(internal numeric type, Arrow builder class) once per numeric type
/// that is written through fillArrowArrayWithNumericColumnData<>. Used to generate the
/// `else if` chain in ParquetBlockOutputStream::write(), where the first argument is also
/// stringized and compared with the type's family name.
# define FOR_INTERNAL_NUMERIC_TYPES(M) \
M(UInt8, arrow::UInt8Builder) \
M(Int8, arrow::Int8Builder) \
M(UInt16, arrow::UInt16Builder) \
M(Int16, arrow::Int16Builder) \
M(UInt32, arrow::UInt32Builder) \
M(Int32, arrow::Int32Builder) \
M(UInt64, arrow::UInt64Builder) \
M(Int64, arrow::Int64Builder) \
M(Float32, arrow::FloatBuilder) \
M(Float64, arrow::DoubleBuilder)
/// Maps the family name of an internal data type (IDataType::getFamilyName(), as looked up
/// in write()) to the Arrow type used for the schema field.
/// Decimal types are not listed: write() creates their Arrow type from precision and scale.
/// Date and DateTime are stored as plain unsigned integers, matching the UInt16/UInt32
/// builders used by the corresponding fill functions.
const std::unordered_map<String, std::shared_ptr<arrow::DataType>> internal_type_to_arrow_type = {
{"UInt8", arrow::uint8()},
{"Int8", arrow::int8()},
{"UInt16", arrow::uint16()},
{"Int16", arrow::int16()},
{"UInt32", arrow::uint32()},
{"Int32", arrow::int32()},
{"UInt64", arrow::uint64()},
{"Int64", arrow::int64()},
{"Float32", arrow::float32()},
{"Float64", arrow::float64()},
//{"Date", arrow::date64()},
//{"Date", arrow::date32()},
{"Date", arrow::uint16()}, // CHECK
//{"DateTime", arrow::date64()}, // BUG! saves as date32
{"DateTime", arrow::uint32()},
// TODO: ClickHouse can actually store non-utf8 strings!
{"String", arrow::utf8()},
{"FixedString", arrow::utf8()},
};
/// Returns a pointer to the null map bytes of a nullable column.
/// `column` must actually be a ColumnNullable (it is cast without a check); the returned
/// pointer refers to data held by that column's null map column.
const PaddedPODArray<UInt8> * extractNullBytemapPtr(ColumnPtr column)
{
    const auto & nullable_column = static_cast<const ColumnNullable &>(*column);
    ColumnPtr null_map_holder = nullable_column.getNullMapColumnPtr();
    const auto & null_map_column = static_cast<const ColumnVector<UInt8> &>(*null_map_holder);
    return &null_map_column.getData();
}
class OstreamOutputStream : public parquet::OutputStream
{
public:
explicit OstreamOutputStream(WriteBuffer & ostr_) : ostr(ostr_) {}
virtual ~OstreamOutputStream() {}
virtual void Close() {}
virtual int64_t Tell() { return total_length; }
virtual void Write(const uint8_t * data, int64_t length)
{
ostr.write(reinterpret_cast<const char *>(data), length);
total_length += length;
}
private:
WriteBuffer & ostr;
int64_t total_length = 0;
PARQUET_DISALLOW_COPY_AND_ASSIGN(OstreamOutputStream);
};
/// Converts one Block into an Arrow table and writes it with the Parquet file writer.
/// For every column an Arrow schema field and an Arrow array are produced; Nullable columns
/// are unwrapped into the nested column plus a null bytemap. The file writer is opened on
/// the first call, using the schema of that first block.
/// Throws UNKNOWN_TYPE for a column type without a conversion and UNKNOWN_EXCEPTION when
/// Arrow/Parquet report an error.
void ParquetBlockOutputStream::write(const Block & block)
{
block.checkNumberOfRows();
const size_t columns_num = block.columns();
/// For arrow::Schema and arrow::Table creation
std::vector<std::shared_ptr<arrow::Field>> arrow_fields;
std::vector<std::shared_ptr<arrow::Array>> arrow_arrays;
arrow_fields.reserve(columns_num);
arrow_arrays.reserve(columns_num);
for (size_t column_i = 0; column_i < columns_num; ++column_i)
{
// TODO: constructed every iteration
const ColumnWithTypeAndName & column = block.safeGetByPosition(column_i);
const bool is_column_nullable = column.type->isNullable();
/// The type inside Nullable(...), or the column type itself when it is not nullable.
const auto & column_nested_type
= is_column_nullable ? static_cast<const DataTypeNullable *>(column.type.get())->getNestedType() : column.type;
/// The family name is the key of internal_type_to_arrow_type and drives the dispatch below.
const std::string column_nested_type_name = column_nested_type->getFamilyName();
/// Step 1: schema field. Decimal fields carry precision and scale, so they are built here
/// instead of being looked up in the static table.
if (isDecimal(column_nested_type))
{
/// Called by callOnIndexAndDataType with a type pair; only the decimal data types add a field.
const auto add_decimal_field = [&](const auto & types) -> bool {
using Types = std::decay_t<decltype(types)>;
using ToDataType = typename Types::LeftType;
if constexpr (
std::is_same_v<
ToDataType,
DataTypeDecimal<
Decimal32>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
{
const auto & decimal_type = static_cast<const ToDataType *>(column_nested_type.get());
arrow_fields.emplace_back(std::make_shared<arrow::Field>(
column.name, arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()), is_column_nullable));
}
return false;
};
callOnIndexAndDataType<void>(column_nested_type->getTypeId(), add_decimal_field);
}
else
{
if (internal_type_to_arrow_type.find(column_nested_type_name) == internal_type_to_arrow_type.end())
{
throw Exception{"The type \"" + column_nested_type_name + "\" of a column \"" + column.name
+ "\""
" is not supported for conversion into a Parquet data format",
ErrorCodes::UNKNOWN_TYPE};
}
arrow_fields.emplace_back(std::make_shared<arrow::Field>(column.name, internal_type_to_arrow_type.at(column_nested_type_name), is_column_nullable));
}
/// Step 2: column data. null_bytemap stays nullptr for non-nullable columns.
std::shared_ptr<arrow::Array> arrow_array;
ColumnPtr nested_column
= is_column_nullable ? static_cast<const ColumnNullable &>(*column.column).getNestedColumnPtr() : column.column;
const PaddedPODArray<UInt8> * null_bytemap = is_column_nullable ? extractNullBytemapPtr(column.column) : nullptr;
if ("String" == column_nested_type_name)
{
fillArrowArrayWithStringColumnData<ColumnString>(nested_column, arrow_array, null_bytemap);
}
else if ("FixedString" == column_nested_type_name)
{
fillArrowArrayWithStringColumnData<ColumnFixedString>(nested_column, arrow_array, null_bytemap);
}
else if ("Date" == column_nested_type_name)
{
fillArrowArrayWithDateColumnData(nested_column, arrow_array, null_bytemap);
}
else if ("DateTime" == column_nested_type_name)
{
fillArrowArrayWithDateTimeColumnData(nested_column, arrow_array, null_bytemap);
}
else if (isDecimal(column_nested_type))
{
/// Same type dispatch as for the schema field, this time to fill the array.
auto fill_decimal = [&](const auto & types) -> bool
{
using Types = std::decay_t<decltype(types)>;
using ToDataType = typename Types::LeftType;
if constexpr (
std::is_same_v<
ToDataType,
DataTypeDecimal<
Decimal32>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
{
const auto & decimal_type = static_cast<const ToDataType *>(column_nested_type.get());
fillArrowArrayWithDecimalColumnData(nested_column, arrow_array, null_bytemap, decimal_type);
}
return false;
};
callOnIndexAndDataType<void>(column_nested_type->getTypeId(), fill_decimal);
}
/// The macro below expands to one more `else if` branch per numeric type.
# define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
else if (#CPP_NUMERIC_TYPE == column_nested_type_name) \
{ \
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(nested_column, arrow_array, null_bytemap); \
}
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
# undef DISPATCH
else
{
throw Exception{"Internal type \"" + column_nested_type_name + "\" of a column \"" + column.name
+ "\""
" is not supported for conversion into a Parquet data format",
ErrorCodes::UNKNOWN_TYPE};
}
arrow_arrays.emplace_back(std::move(arrow_array));
}
/// Step 3: assemble the table and hand it to the Parquet writer.
std::shared_ptr<arrow::Schema> arrow_schema = std::make_shared<arrow::Schema>(std::move(arrow_fields));
std::shared_ptr<arrow::Table> arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays);
/// NOTE(review): the sink is created on every call but only used when the writer is opened below.
auto sink = std::make_shared<OstreamOutputStream>(ostr);
if (!file_writer)
{
parquet::WriterProperties::Builder builder;
/// Snappy compression is requested only when the build has USE_SNAPPY enabled.
#if USE_SNAPPY
builder.compression(parquet::Compression::SNAPPY);
#endif
auto props = builder.build();
auto status = parquet::arrow::FileWriter::Open(
*arrow_table->schema(),
arrow::default_memory_pool(),
sink,
props, /*parquet::default_writer_properties(),*/
parquet::arrow::default_arrow_writer_properties(),
&file_writer);
if (!status.ok())
throw Exception{"Error while opening a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
}
// TODO: calculate row_group_size depending on a number of rows and table size
auto status = file_writer->WriteTable(*arrow_table, format_settings.parquet.row_group_size);
if (!status.ok())
throw Exception{"Error while writing a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
}
void ParquetBlockOutputStream::writeSuffix()
{
if (file_writer)
{
auto status = file_writer->Close();
if (!status.ok())
throw Exception{"Error while closing a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
}
}
/// Registers the "Parquet" output format. The Parquet stream is wrapped into a
/// SquashingBlockOutputStream configured with the row group size from the format settings
/// as its row threshold (and 0 as its byte threshold), with flushing disabled on the wrapper.
void registerOutputFormatParquet(FormatFactory & factory)
{
    auto create_stream = [](WriteBuffer & out, const Block & header, const Context & /*context*/, const FormatSettings & settings)
    {
        BlockOutputStreamPtr parquet_stream = std::make_shared<ParquetBlockOutputStream>(out, header, settings);

        auto squashing_stream = std::make_shared<SquashingBlockOutputStream>(
            parquet_stream, parquet_stream->getHeader(), settings.parquet.row_group_size, 0);
        squashing_stream->disableFlush();

        return squashing_stream;
    };

    factory.registerOutputFormat("Parquet", create_stream);
}
}
#else
namespace DB
{

class FormatFactory;

/// Built without Parquet support (USE_PARQUET is off): nothing to register.
void registerOutputFormatParquet(FormatFactory &) {}

}
#endif

View File

@ -0,0 +1,46 @@
#pragma once
#include <Common/config.h>
#if USE_PARQUET
# include <DataStreams/IBlockOutputStream.h>
# include <Formats/FormatSettings.h>
namespace arrow
{
class Array;
class DataType;
}
namespace parquet
{
namespace arrow
{
class FileWriter;
}
}
namespace DB
{
/// Block output stream for the "Parquet" format.
/// write() converts each block into an Arrow table and passes it to a Parquet file writer;
/// writeSuffix() closes that writer.
/// NOTE(review): file_writer is a unique_ptr to a type that is only forward-declared here,
/// so any code that destroys this object needs the full parquet::arrow::FileWriter definition - confirm.
class ParquetBlockOutputStream : public IBlockOutputStream
{
public:
ParquetBlockOutputStream(WriteBuffer & ostr_, const Block & header_, const FormatSettings & format_settings);
Block getHeader() const override { return header; }
void write(const Block & block) override;
void writeSuffix() override;
void flush() override;
String getContentType() const override { return "application/octet-stream"; }
private:
/// Destination of the encoded Parquet bytes (referenced, not owned).
WriteBuffer & ostr;
/// Sample block returned by getHeader().
Block header;
/// Own copy of the settings; write() reads parquet.row_group_size from it.
const FormatSettings format_settings;
/// Created by the first write(); empty until then.
std::unique_ptr<parquet::arrow::FileWriter> file_writer;
};
}
#endif

View File

@ -1,11 +1,10 @@
#pragma once
#include <Formats/FormatSettings.h>
#include <Formats/TabSeparatedRowOutputStream.h>
namespace DB
{
struct FormatSettings;
/** A stream for outputting data in tsv format, but without escaping individual values.
* (That is, the output is irreversible.)

View File

@ -618,7 +618,7 @@ inline void readDigits(ReadBuffer & buf, T & x, unsigned int & digits, int & exp
++places; // num zeroes before + current digit
if (digits + places > max_digits)
throw Exception("Too many digits in decimal value", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
throw Exception("Too many digits (" + std::to_string(digits + places) + " > " + std::to_string(max_digits) + ") in decimal value", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
digits += places;
if (after_point)

View File

@ -165,6 +165,7 @@ struct Settings
M(SettingUInt64, output_format_pretty_max_rows, 10000, "Rows limit for Pretty formats.") \
M(SettingUInt64, output_format_pretty_max_column_pad_width, 250, "Maximum width to pad all values in a column in Pretty formats.") \
M(SettingBool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats") \
M(SettingUInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.") \
\
M(SettingBool, use_client_time_zone, false, "Use client timezone for interpreting DateTime string values, instead of adopting server timezone.") \
\

View File

@ -31,6 +31,7 @@ const char * auto_config_build[]
"BUILD_COMPILE_DEFINITIONS", "@BUILD_COMPILE_DEFINITIONS@",
"BUILD_INCLUDE_DIRECTORIES", "@BUILD_INCLUDE_DIRECTORIES@",
"STATIC", "@USE_STATIC_LIBRARIES@",
"SPLIT_BINARY", "@CLICKHOUSE_SPLIT_BINARY@",
"USE_EMBEDDED_COMPILER", "@USE_EMBEDDED_COMPILER@",
"USE_INTERNAL_MEMCPY", "@USE_INTERNAL_MEMCPY@",
"USE_GLIBC_COMPATIBILITY", "@GLIBC_COMPATIBILITY@",
@ -48,6 +49,10 @@ const char * auto_config_build[]
"USE_POCO_MONGODB", "@USE_POCO_MONGODB@",
"USE_POCO_NETSSL", "@USE_POCO_NETSSL@",
"USE_BASE64", "@USE_BASE64@",
"USE_XXHASH", "@USE_XXHASH@",
"USE_HDFS", "@USE_HDFS@",
"USE_SNAPPY", "@USE_SNAPPY@",
"USE_PARQUET", "@USE_PARQUET@",
"USE_PROTOBUF", "@USE_PROTOBUF@",
"USE_BROTLI", "@USE_BROTLI@",

View File

@ -240,5 +240,3 @@ SELECT toUInt64('9223372036854775809') AS x, toDecimal64(x, 0); -- { serverError
SELECT toDecimal32(0, rowNumberInBlock()); -- { serverError 44 }
SELECT toDecimal64(0, rowNumberInBlock()); -- { serverError 44 }
SELECT toDecimal128(0, rowNumberInBlock()); -- { serverError 44 }
DROP TABLE IF EXISTS test.decimal;

View File

@ -0,0 +1,62 @@
9999
9998
9997
9996
9995
9994
9993
9992
9991
9990
99999
99998
99997
99996
99995
99994
99993
99992
99991
99990
2
1
0
999
998
997
996
995
994
993
992
991
990
ContextLock Number of times the lock of Context was acquired or tried to acquire. This is global lock.
Query Number of queries started to be interpreted and maybe executed. Does not include queries that are failed to parse, that are rejected due to AST size limits; rejected due to quota limits or limits on number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.
original:
-128 0 -32768 0 -2147483648 0 -9223372036854775808 0 -1.032 -1.064 string-1 fixedstring-1\0\0 2003-04-05 2003-02-03 04:05:06
-108 108 -1016 1116 -1032 1132 -1064 1164 -1.032 -1.064 string-0 fixedstring\0\0\0\0 2001-02-03 2002-02-03 04:05:06
127 255 32767 65535 2147483647 4294967295 9223372036854775807 9223372036854775807 -1.032 -1.064 string-2 fixedstring-2\0\0 2004-06-07 2004-02-03 04:05:06
converted:
-128 0 -32768 0 -2147483648 0 -9223372036854775808 0 -1.032 -1.064 string-1 fixedstring-1\0\0 2003-04-05 2003-02-03 04:05:06
-108 108 -1016 1116 -1032 1132 -1064 1164 -1.032 -1.064 string-0 fixedstring\0\0\0\0 2001-02-03 2002-02-03 04:05:06
127 255 32767 65535 2147483647 4294967295 9223372036854775807 9223372036854775807 -1.032 -1.064 string-2 fixedstring-2\0\0 2004-06-07 2004-02-03 04:05:06
diff:
dest:
79 81 82 83 84 85 86 87 88 89 str01\0\0\0\0\0\0\0\0\0\0 fstr1\0\0\0\0\0\0\0\0\0\0 2003-03-04 1970-01-01 06:29:04
80 81 82 83 84 85 86 87 88 89 str02 fstr2\0\0\0\0\0\0\0\0\0\0 2005-03-04 2006-08-09 10:11:12
min:
-128 0 0 0 0 0 0 0 -1 -1 string-1\0\0\0\0\0\0\0 fixedstring-1\0\0 2003-04-05 2003-02-03
-108 108 8 92 -8 108 -40 -116 -1 -1 string-0\0\0\0\0\0\0\0 fixedstring\0\0\0\0 2001-02-03 2002-02-03
79 81 82 83 84 85 86 87 88 89 str01\0\0\0\0\0\0\0\0\0\0 fstr1\0\0\0\0\0\0\0\0\0\0 2003-03-04 2004-05-06
127 -1 -1 -1 -1 -1 -1 -1 -1 -1 string-2\0\0\0\0\0\0\0 fixedstring-2\0\0 2004-06-07 2004-02-03
max:
-128 0 -32768 0 -2147483648 0 -9223372036854775808 0 -1 -1 string-1 fixedstring-1\0\0 1970-01-01 06:22:27 2003-02-03 04:05:06
-108 108 -1016 1116 -1032 1132 -1064 1164 -1 -1 string-0 fixedstring\0\0\0\0 1970-01-01 06:09:16 2002-02-03 04:05:06
80 81 82 83 84 85 86 87 88 89 str02 fstr2 2005-03-04 05:06:07 2006-08-09 10:11:12
127 255 32767 65535 2147483647 4294967295 9223372036854775807 9223372036854775807 -1 -1 string-2 fixedstring-2\0\0 1970-01-01 06:29:36 2004-02-03 04:05:06
dest from null:
-128 0 -32768 0 -2147483648 0 -9223372036854775808 0 -1.032 -1.064 string-1 fixedstring-1\0\0 2003-04-05 2003-02-03 04:05:06
-108 108 -1016 1116 -1032 1132 -1064 1164 -1.032 -1.064 string-0 fixedstring\0\0\0\0 2001-02-03 2002-02-03 04:05:06
127 255 32767 65535 2147483647 4294967295 9223372036854775807 9223372036854775807 -1.032 -1.064 string-2 fixedstring-2\0\0 2004-06-07 2004-02-03 04:05:06
\N \N \N \N \N \N \N \N \N \N \N \N \N \N

View File

@ -0,0 +1,136 @@
#!/usr/bin/env bash
set -e
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CUR_DIR/../shell_config.sh
# Disabled manual invocations: they write Parquet dumps of system.numbers with various
# row counts and max_block_size values into ${CLICKHOUSE_TMP}.
#${CLICKHOUSE_CLIENT} --max_block_size=1 --query="SELECT * FROM system.numbers LIMIT 10 FORMAT Parquet" > ${CLICKHOUSE_TMP}/t1.pq
#${CLICKHOUSE_CLIENT} --max_block_size=5 --query="SELECT * FROM system.numbers LIMIT 10 FORMAT Parquet" > ${CLICKHOUSE_TMP}/t5.pq
#${CLICKHOUSE_CLIENT} --max_block_size=15 --query="SELECT * FROM system.numbers LIMIT 10 FORMAT Parquet" > ${CLICKHOUSE_TMP}/t15.pq
#${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.numbers LIMIT 100000 FORMAT Parquet" > ${CLICKHOUSE_TMP}/t100000.pq
#${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.numbers LIMIT 1000000000 FORMAT Parquet" > ${CLICKHOUSE_TMP}/t1g.pq
#${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.numbers LIMIT 100000000 FORMAT Parquet" > ${CLICKHOUSE_TMP}/t100m.pq
#${CLICKHOUSE_CLIENT} --max_block_size=100000000 --query="SELECT * FROM system.numbers LIMIT 100000000 FORMAT Parquet" > ${CLICKHOUSE_TMP}/t100m-100mbs.pq
#valgrind --tool=massif ${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.numbers LIMIT 1000000 FORMAT Parquet" > ${CLICKHOUSE_TMP}/t1g.pq

# Round trip of a single String column: export system.contributors as Parquet and
# pipe it straight into an INSERT ... FORMAT Parquet on a Memory table.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.contributors"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.contributors (name String) ENGINE = Memory"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.contributors ORDER BY name DESC FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.contributors FORMAT Parquet"
# The selected rows are not stable ("random results"), so the output is discarded
# instead of being compared with the reference.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.contributors LIMIT 10" > /dev/null
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.contributors"
# Round trip of a UInt64 column with different row counts and block sizes.
# After each insert the ten largest values are printed, then the table is emptied.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_numbers"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_numbers (number UInt64) ENGINE = Memory"
# less than default block size (65k)
${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.numbers LIMIT 10000 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_numbers FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_numbers ORDER BY number DESC LIMIT 10"
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test.parquet_numbers"
# More than default block size
${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.numbers LIMIT 100000 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_numbers FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_numbers ORDER BY number DESC LIMIT 10"
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test.parquet_numbers"
# Disabled: the same round trip with 10000000 rows, and a file dump of the max_block_size=2 case.
#${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.numbers LIMIT 10000000 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_numbers FORMAT Parquet"
#${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_numbers ORDER BY number DESC LIMIT 10"
#${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test.parquet_numbers"
#${CLICKHOUSE_CLIENT} --max_block_size=2 --query="SELECT * FROM system.numbers LIMIT 3 FORMAT Parquet" > ${CLICKHOUSE_TMP}/bs2.pq
# 3 rows exported with max_block_size=2.
${CLICKHOUSE_CLIENT} --max_block_size=2 --query="SELECT * FROM system.numbers LIMIT 3 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_numbers FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_numbers ORDER BY number DESC LIMIT 10"
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test.parquet_numbers"
# 1000 rows exported with max_block_size=1.
${CLICKHOUSE_CLIENT} --max_block_size=1 --query="SELECT * FROM system.numbers LIMIT 1000 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_numbers FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_numbers ORDER BY number DESC LIMIT 10"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_numbers"
# Round trip of a multi-column table (String, UInt64, String) fed from system.events.
# Only event and description are printed, and only for two fixed event names;
# the value column is not part of the printed output.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_events"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_events (event String, value UInt64, description String) ENGINE = Memory"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.events FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_events FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT event, description FROM test.parquet_events WHERE event IN ('ContextLock', 'Query') ORDER BY event"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_events"
# Disabled earlier variant of the types test (no uint32, fixedstring, date or datetime columns).
#${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_types1"
#${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_types2"
#${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_types1 (int8 Int8, uint8 UInt8, int16 Int16, uint16 UInt16, int32 Int32, int64 Int64, uint64 UInt64, float32 Float32, float64 Float64, string String ) ENGINE = Memory"
#${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_types2 (int8 Int8, uint8 UInt8, int16 Int16, uint16 UInt16, int32 Int32, int64 Int64, uint64 UInt64, float32 Float32, float64 Float64, string String ) ENGINE = Memory"
#${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types1 values ( -108, 108, -1016, 1116, -1032, -1064, 1164, -1.032, -1.064, 'string' )"
#${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types1 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types2 FORMAT Parquet"

# Four tables with the same column names:
#   parquet_types1 - source rows, one column per tested type
#   parquet_types2 - same schema as parquet_types1, destination of the round trips
#   parquet_types3 - narrow column types ("min")
#   parquet_types4 - wide column types ("max")
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_types1"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_types2"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_types3"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_types4"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_types1 (int8 Int8, uint8 UInt8, int16 Int16, uint16 UInt16, int32 Int32, uint32 UInt32, int64 Int64, uint64 UInt64, float32 Float32, float64 Float64, string String, fixedstring FixedString(15), date Date, datetime DateTime) ENGINE = Memory"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_types2 (int8 Int8, uint8 UInt8, int16 Int16, uint16 UInt16, int32 Int32, uint32 UInt32, int64 Int64, uint64 UInt64, float32 Float32, float64 Float64, string String, fixedstring FixedString(15), date Date, datetime DateTime) ENGINE = Memory"
# convert min type
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_types3 (int8 Int8, uint8 Int8, int16 Int8, uint16 Int8, int32 Int8, uint32 Int8, int64 Int8, uint64 Int8, float32 Int8, float64 Int8, string FixedString(15), fixedstring FixedString(15), date Date, datetime Date) ENGINE = Memory"
# convert max type
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_types4 (int8 Int64, uint8 Int64, int16 Int64, uint16 Int64, int32 Int64, uint32 Int64, int64 Int64, uint64 Int64, float32 Int64, float64 Int64, string String, fixedstring String, date DateTime, datetime DateTime) ENGINE = Memory"
# A row of ordinary values, followed by rows holding the integer minimums and maximums.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types1 values ( -108, 108, -1016, 1116, -1032, 1132, -1064, 1164, -1.032, -1.064, 'string-0', 'fixedstring', '2001-02-03', '2002-02-03 04:05:06')"
# min
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types1 values ( -128, 0, -32768, 0, -2147483648, 0, -9223372036854775808, 0, -1.032, -1.064, 'string-1', 'fixedstring-1', '2003-04-05', '2003-02-03 04:05:06')"
# max
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types1 values ( 127, 255, 32767, 65535, 2147483647, 4294967295, 9223372036854775807, 9223372036854775807, -1.032, -1.064, 'string-2', 'fixedstring-2', '2004-06-07', '2004-02-03 04:05:06')"
# 'SELECT -127,-128,-129,126,127,128,255,256,257,-32767,-32768,-32769,32766,32767,32768,65535,65536,65537, -2147483647,-2147483648,-2147483649,2147483646,2147483647,2147483648,4294967295,4294967296,4294967297, -9223372036854775807,-9223372036854775808,9223372036854775806,9223372036854775807,9223372036854775808,18446744073709551615';

# Identity round trip: parquet_types1 -> Parquet -> parquet_types2 (same schema).
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types1 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types2 FORMAT Parquet"
# Print both tables and keep a copy of each listing for the diff below.
echo original:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types1 ORDER BY int8" | tee ${CLICKHOUSE_TMP}/parquet_all_types_1.dump
echo converted:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types2 ORDER BY int8" | tee ${CLICKHOUSE_TMP}/parquet_all_types_2.dump
# The .parquet files are written to ${CLICKHOUSE_TMP} but not read again in this script.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types1 ORDER BY int8 FORMAT Parquet" > ${CLICKHOUSE_TMP}/parquet_all_types_1.parquet
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types2 ORDER BY int8 FORMAT Parquet" > ${CLICKHOUSE_TMP}/parquet_all_types_2.parquet
# With `set -e` in effect, a non-empty diff (exit status 1) stops the script here.
echo diff:
diff ${CLICKHOUSE_TMP}/parquet_all_types_1.dump ${CLICKHOUSE_TMP}/parquet_all_types_2.dump
# Cross-schema inserts: Parquet written from one table is inserted into a table whose
# columns have the same names but different types.
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test.parquet_types2"
# Narrow table: one own row, exported into parquet_types2; then parquet_types1 rows imported into it.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types3 values ( 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 'str01', 'fstr1', '2003-03-04', '2004-05-06')"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types3 ORDER BY int8 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types2 FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types1 ORDER BY int8 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types3 FORMAT Parquet"
# Wide table: the same two steps.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types4 values ( 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 'str02', 'fstr2', '2005-03-04 05:06:07', '2006-08-09 10:11:12')"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types4 ORDER BY int8 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types2 FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types1 ORDER BY int8 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types4 FORMAT Parquet"
# Print the three tables; the listings are part of the expected test output.
echo dest:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types2 ORDER BY int8"
echo min:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types3 ORDER BY int8"
echo max:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types4 ORDER BY int8"
# Nullable variant: parquet_types5 and parquet_types6 repeat the parquet_types1 columns
# with every type wrapped in Nullable(...).
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_types5"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_types6"
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test.parquet_types2"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_types5 (int8 Nullable(Int8), uint8 Nullable(UInt8), int16 Nullable(Int16), uint16 Nullable(UInt16), int32 Nullable(Int32), uint32 Nullable(UInt32), int64 Nullable(Int64), uint64 Nullable(UInt64), float32 Nullable(Float32), float64 Nullable(Float64), string Nullable(String), fixedstring Nullable(FixedString(15)), date Nullable(Date), datetime Nullable(DateTime)) ENGINE = Memory"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_types6 (int8 Nullable(Int8), uint8 Nullable(UInt8), int16 Nullable(Int16), uint16 Nullable(UInt16), int32 Nullable(Int32), uint32 Nullable(UInt32), int64 Nullable(Int64), uint64 Nullable(UInt64), float32 Nullable(Float32), float64 Nullable(Float64), string Nullable(String), fixedstring Nullable(FixedString(15)), date Nullable(Date), datetime Nullable(DateTime)) ENGINE = Memory"
# A single row in which every column is NULL.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types5 values ( NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types5 ORDER BY int8 FORMAT Parquet" > ${CLICKHOUSE_TMP}/parquet_all_types_5.parquet
#${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types5 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types6 FORMAT Parquet"
# parquet_types6 receives the all-NULL row and then the non-Nullable rows of parquet_types1.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types5 ORDER BY int8 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types6 FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types1 ORDER BY int8 FORMAT Parquet" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_types6 FORMAT Parquet"
echo dest from null:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_types6 ORDER BY int8"

# Drop every table created by the types tests.
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_types5"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_types6"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_types1"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_types2"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_types3"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_types4"

View File

@ -0,0 +1,54 @@
#!/usr/bin/env perl
package parquet_create_table_columns;
use strict;
no warnings 'experimental';
use feature 'signatures';
use JSON::XS;
#use Data::Dumper;
# Slurp $file and return its entire contents as a single string.
# Returns nothing (undef in scalar context) when the file cannot be opened.
sub file_read($file) {
    my $fh;
    return unless open $fh, '<', $file;
    # With the record separator undefined, one read returns the whole file.
    my $content = do { local $/; <$fh> };
    close $fh;
    return $content;
}
# Parquet logical type name => ClickHouse type name.
# columns() consults this map first and falls back to the physical-type map below.
our $type_parquet_logical_to_clickhouse = {
DECIMAL => 'Decimal128(1)',
TIMESTAMP_MICROS => 'DateTime',
TIMESTAMP_MILLIS => 'DateTime',
};
# Parquet physical type name => ClickHouse type name.
# Used when the column's logical type has no entry in the map above.
our $type_parquet_physical_to_clickhouse = {
BOOLEAN => 'UInt8',
INT32 => 'Int32',
INT64 => 'Int64',
FLOAT => 'Float32',
DOUBLE => 'Float64',
BYTE_ARRAY => 'String',
FIXED_LEN_BYTE_ARRAY => 'String', # Maybe FixedString?
INT96 => 'Int64', # TODO!
};
# Print (to STDOUT, without a trailing newline) a comma-separated list of
# "name Nullable(type)" column definitions for the schema in $json.
#
# $json is a hash reference whose 'Columns' entry is an array of column hashes with
# the keys Name, LogicalType, PhysicalType and Id. The type is looked up by logical
# type first, then by physical type. A column with no known type triggers a warning
# and is still emitted, as "Nullable()". When a name repeats, the column's Id is
# appended to every occurrence after the first.
sub columns ($json) {
    my (@definitions, %seen);
    foreach my $col (@{ $json->{Columns} }) {
        my $col_name = $col->{'Name'};

        my $ch_type = $type_parquet_logical_to_clickhouse->{ $col->{'LogicalType'} };
        $ch_type ||= $type_parquet_physical_to_clickhouse->{ $col->{'PhysicalType'} };
        if (!$ch_type) {
            warn "Unknown type [$col->{'PhysicalType'}:$col->{'LogicalType'}] of column [$col_name]";
        }

        # Names can be non-unique: the first occurrence keeps its name, later ones get the Id suffix.
        if ($seen{$col_name}++) {
            $col_name .= $col->{'Id'};
        }

        push @definitions, "$col_name Nullable($ch_type)";
    }
    print join ', ', @definitions;
}
# Read the JSON document stored in $file, decode it and print its column list via columns().
sub columns_file ($file) {
    my $schema = JSON::XS::decode_json(file_read($file));
    return columns($schema);
}

# Script entry point: when the file is executed directly (there is no caller),
# the first command-line argument is taken as the JSON file to process.
columns_file(shift @ARGV) unless caller;

View File

@ -0,0 +1,17 @@
diff0:
diff1:
diff2:
nothing:
nulls:
\N \N \N \N
full orig:
1 \N \N \N
\N 1 \N \N
\N \N 1 \N
\N \N \N \N
full inserted:
1 \N \N \N
\N 1 \N \N
\N \N 1 \N
\N \N \N \N
diff3:

View File

@ -0,0 +1,111 @@
#!/usr/bin/env bash
# Functional test: round-trips Decimal columns through the Parquet format with clickhouse-client.
# `set -e` is not enabled in this script, so a failing command does not stop it.

# set -x

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# Quoted so that sourcing still works when the checkout path contains spaces or glob characters.
# NOTE(review): shell_config.sh presumably defines CLICKHOUSE_CLIENT and CLICKHOUSE_TMP, which
# this script uses without setting them itself - confirm.
. "$CUR_DIR/../shell_config.sh"
# Case 0: small values. test.decimal is the source table and test.decimal2 (same structure)
# receives the Parquet round trip; text dumps of both tables are then diffed.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal;"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal2;"
# Simple small values
${CLICKHOUSE_CLIENT} --query="CREATE TABLE IF NOT EXISTS test.decimal (a DECIMAL(9,0), b DECIMAL(18,0), c DECIMAL(38,0), d DECIMAL(9, 9), e DECIMAL(18, 18), f DECIMAL(38, 38), g Decimal(9, 5), h decimal(18, 9), i deciMAL(38, 18), j DECIMAL(1,0)) ENGINE = Memory;"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE IF NOT EXISTS test.decimal2 AS test.decimal ENGINE = Memory;"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);"
# Disabled inserts that also fill the fractional columns.
#${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (1, 1, 1, 0.1, 0.1, 1, 1, 1, 1, 1);"
#${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (10, 10, 10, 0.1, 0.1, 0.1, 10, 10, 10, 10);"
#${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (-100, -100, -100, -0.1, -0.1, -0.1, -100, -100, -100, -100);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c) VALUES (1, 1, 1);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c) VALUES (10, 10, 10);"
# Text dump of the source, a Parquet file copy (not read again in this script), the piped
# round trip into test.decimal2, and a text dump of the result.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j;" > ${CLICKHOUSE_TMP}/parquet_decimal0_1.dump
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j FORMAT Parquet;" > ${CLICKHOUSE_TMP}/parquet_decimal0.parquet
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j FORMAT Parquet;" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal2 FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal2 ORDER BY a, b, c, d, e, f, g, h, i, j;" > ${CLICKHOUSE_TMP}/parquet_decimal0_2.dump
# Any difference between the two dumps is printed after the label.
echo diff0:
diff ${CLICKHOUSE_TMP}/parquet_decimal0_1.dump ${CLICKHOUSE_TMP}/parquet_decimal0_2.dump
# Case 1: same table layout as case 0, recreated from scratch and filled with large-magnitude
# values, smallest non-zero fractions, zeros in several spellings, and values given as strings.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal;"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal2;"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE IF NOT EXISTS test.decimal ( a DECIMAL(9,0), b DECIMAL(18,0), c DECIMAL(38,0), d DECIMAL(9, 9), e DECIMAL(18, 18), f DECIMAL(38, 38), g Decimal(9, 5), h decimal(18, 9), i deciMAL(38, 18), j DECIMAL(1,0)) ENGINE = Memory;"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE IF NOT EXISTS test.decimal2 AS test.decimal ENGINE = Memory;"
# All-nines values, positive and negative.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, d, g) VALUES (999999999, 999999999999999999, 0.999999999, 9999.99999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, d, g) VALUES (-999999999, -999999999999999999, -0.999999999, -9999.99999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (c) VALUES (99999999999999999999999999999999999999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (c) VALUES (-99999999999999999999999999999999999999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (f) VALUES (0.99999999999999999999999999999999999999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (f) VALUES (-0.99999999999999999999999999999999999999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (e, h) VALUES (0.999999999999999999, 999999999.999999999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (e, h) VALUES (-0.999999999999999999, -999999999.999999999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (i) VALUES (99999999999999999999.999999999999999999);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (i) VALUES (-99999999999999999999.999999999999999999);"
# Ones and the smallest non-zero fractions, positive and negative.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, g, j, h) VALUES (1, 1, 1, 0.000000001, 0.00001, 1, 0.000000001);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, g, j, h) VALUES (-1, -1, -1, -0.000000001, -0.00001, -1, -0.000000001);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (e, f) VALUES (0.000000000000000001, 0.00000000000000000000000000000000000001);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (e, f) VALUES (-0.000000000000000001, -0.00000000000000000000000000000000000001);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (i) VALUES (0.000000000000000001);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (i) VALUES (-0.000000000000000001);"
# Zero written as 0, -0, 0.0 and -0.0.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (-0, -0, -0, -0, -0, -0, -0, -0, -0, -0);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0);"
# Values passed as quoted strings, including exponent notation.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, g) VALUES ('42.00000', 42.0000000000000000000000000000000, '0.999990');"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f) VALUES ('0.9e9', '0.9e18', '0.9e38', '9e-9', '9e-18', '9e-38');"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f) VALUES ('-0.9e9', '-0.9e18', '-0.9e38', '-9e-9', '-9e-18', '-9e-38');"
# Dump the source, round-trip through Parquet into test.decimal2, dump the result and diff.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j;" > ${CLICKHOUSE_TMP}/parquet_decimal1_1.dump
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j FORMAT Parquet;" > ${CLICKHOUSE_TMP}/parquet_decimal1.parquet
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j FORMAT Parquet;" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal2 FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal2 ORDER BY a, b, c, d, e, f, g, h, i, j;" > ${CLICKHOUSE_TMP}/parquet_decimal1_2.dump
echo diff1:
diff ${CLICKHOUSE_TMP}/parquet_decimal1_1.dump ${CLICKHOUSE_TMP}/parquet_decimal1_2.dump
# Case 2: columns declared with the Decimal32/Decimal64/Decimal128 and dec(...) spellings,
# filled with one positive and one negative row of 42-based values.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal;"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal2;"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE IF NOT EXISTS test.decimal (a DECIMAL(9,0), b DECIMAL(18,0), c DECIMAL(38,0), d DECIMAL(9, 9), e Decimal64(18), f Decimal128(38), g Decimal32(5), h Decimal64(9), i Decimal128(18), j dec(4,2)) ENGINE = Memory;"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE IF NOT EXISTS test.decimal2 AS test.decimal ENGINE = Memory;"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (42, 42, 42, 0.42, 0.42, 0.42, 42.42, 42.42, 42.42, 42.42);"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal (a, b, c, d, e, f, g, h, i, j) VALUES (-42, -42, -42, -0.42, -0.42, -0.42, -42.42, -42.42, -42.42, -42.42);"
# Dump the source, round-trip through Parquet into test.decimal2, dump the result and diff.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j;" > ${CLICKHOUSE_TMP}/parquet_decimal2_1.dump
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j FORMAT Parquet;" > ${CLICKHOUSE_TMP}/parquet_decimal2.parquet
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d, e, f, g, h, i, j FORMAT Parquet;" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal2 FORMAT Parquet"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal2 ORDER BY a, b, c, d, e, f, g, h, i, j;" > ${CLICKHOUSE_TMP}/parquet_decimal2_2.dump
echo diff2:
diff ${CLICKHOUSE_TMP}/parquet_decimal2_1.dump ${CLICKHOUSE_TMP}/parquet_decimal2_2.dump
# Case 3: Nullable decimal columns - an empty table, an all-NULL row, then rows with one value each.
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal;"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal2;"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE IF NOT EXISTS test.decimal (a Nullable(DECIMAL(9,0)), b Nullable(DECIMAL(18,0)), c Nullable(DECIMAL(38,0)), d Nullable(DECIMAL(9,0))) ENGINE = Memory;"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE IF NOT EXISTS test.decimal2 AS test.decimal ENGINE = Memory;"
# Empty table test
# throws No data to insert
# The INSERT's stderr is discarded, and the SELECT under "nothing:" is expected to print no rows.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d FORMAT Parquet;" > ${CLICKHOUSE_TMP}/parquet_decimal3_1.parquet
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d FORMAT Parquet;" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal2 FORMAT Parquet" 2> /dev/null
echo nothing:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal2 ORDER BY a, b, c, d;"
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test.decimal2;"
# A single row in which every column is NULL.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal VALUES (Null, Null, Null, Null)"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d FORMAT Parquet;" > ${CLICKHOUSE_TMP}/parquet_decimal3_2.parquet
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d FORMAT Parquet;" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal2 FORMAT Parquet"
echo nulls:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal2 ORDER BY a, b, c, d;"
${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test.decimal2;"
# Three more rows, each with a value in exactly one of a, b, c; the all-NULL row stays in
# test.decimal, so four rows are round-tripped into the emptied test.decimal2.
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal VALUES (1, Null, Null, Null)"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal VALUES (Null, 1, Null, Null)"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal VALUES (Null, Null, 1, Null)"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d FORMAT Parquet;" > ${CLICKHOUSE_TMP}/parquet_decimal3_3.parquet
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d FORMAT Parquet;" | ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.decimal2 FORMAT Parquet"
# Print the source table and the table filled through the Parquet round trip.
echo full orig:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d;"
echo full inserted:
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal2 ORDER BY a, b, c, d;"
# Dump the SOURCE table (test.decimal) and the round-tripped table (test.decimal2) so the
# diff below compares two different tables. Dumping test.decimal2 twice would make the diff
# trivially empty and unable to detect a round-trip mismatch.
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal ORDER BY a, b, c, d;" > ${CLICKHOUSE_TMP}/parquet_decimal3_1.dump
${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.decimal2 ORDER BY a, b, c, d;" > ${CLICKHOUSE_TMP}/parquet_decimal3_2.dump
echo diff3:
diff ${CLICKHOUSE_TMP}/parquet_decimal3_1.dump ${CLICKHOUSE_TMP}/parquet_decimal3_2.dump
# NOTE(review): cleanup is disabled, so test.decimal and test.decimal2 remain after the test - confirm this is intended.
#${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal;"
#${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.decimal2;"

View File

@ -0,0 +1,729 @@
=== Try load data from alltypes_dictionary.parquet
0 1 0 0 0 0 0 0 01/01/09 0 1230768000
1 0 1 1 1 10 1.1 10.1 01/01/09 1 1230768060
=== Try load data from alltypes_plain.parquet
4 1 0 0 0 0 0 0 03/01/09 0 1235865600
5 0 1 1 1 10 1.1 10.1 03/01/09 1 1235865660
6 1 0 0 0 0 0 0 04/01/09 0 1238544000
7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060
2 1 0 0 0 0 0 0 02/01/09 0 1233446400
3 0 1 1 1 10 1.1 10.1 02/01/09 1 1233446460
0 1 0 0 0 0 0 0 01/01/09 0 1230768000
1 0 1 1 1 10 1.1 10.1 01/01/09 1 1230768060
=== Try load data from alltypes_plain.snappy.parquet
6 1 0 0 0 0 0 0 04/01/09 0 1238544000
7 0 1 1 1 10 1.1 10.1 04/01/09 1 1238544060
=== Try load data from byte_array_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
=== Try load data from datapage_v2.snappy.parquet
Code: 33. DB::Ex---tion: Error while reading parquet data: IOError: Arrow error: IOError: Corrupt snappy compressed data.
=== Try load data from fixed_length_decimal_1.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
=== Try load data from fixed_length_decimal_legacy.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
=== Try load data from int32_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
=== Try load data from int64_decimal.parquet
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
=== Try load data from nation.dict-malformed.parquet
0 ALGERIA 0 haggle. carefully final deposits detect slyly agai
1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon
2 BRAZIL 1 y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special
3 CANADA 1 eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold
4 EGYPT 4 y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d
5 ETHIOPIA 0 ven packages wake quickly. regu
6 FRANCE 3 refully final requests. regular, ironi
7 GERMANY 3 l platelets. regular accounts x-ray: unusual, regular acco
8 INDIA 2 ss excuses cajole slyly across the packages. deposits print aroun
9 INDONESIA 2 slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull
10 IRAN 4 efully alongside of the slyly final dependencies.
11 IRAQ 4 nic deposits boost atop the quickly final requests? quickly regula
12 JAPAN 2 ously. final, express gifts cajole a
13 JORDAN 4 ic deposits are blithely about the carefully regular pa
14 KENYA 0 pending excuses haggle furiously deposits. pending, express pinto beans wake fluffily past t
15 MOROCCO 0 rns. blithely bold courts among the closely regular packages use furiously bold platelets?
16 MOZAMBIQUE 0 s. ironic, unusual asymptotes wake blithely r
17 PERU 1 platelets. blithely pending dependencies use fluffily across the even pinto beans. carefully silent accoun
18 CHINA 2 c dependencies. furiously express notornis sleep slyly regular accounts. ideas sleep. depos
19 ROMANIA 3 ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account
20 SAUDI ARABIA 4 ts. silent requests haggle. closely express packages sleep across the blithely
21 VIETNAM 2 hely enticingly express accounts. even, final
22 RUSSIA 3 requests against the platelets use never according to the quickly regular pint
23 UNITED KINGDOM 3 eans boost carefully special requests. accounts are. carefull
24 UNITED STATES 1 y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be
=== Try load data from nested_lists.snappy.parquet
Code: 8. DB::Ex---tion: Column "element" is not presented in input data
=== Try load data from nested_maps.snappy.parquet
Code: 33. DB::Ex---tion: Error while reading parquet data: NotImplemented: Currently only nesting with Lists is supported.
=== Try load data from nonnullable.impala.parquet
Code: 33. DB::Ex---tion: Error while reading parquet data: NotImplemented: Currently only nesting with Lists is supported.
=== Try load data from nullable.impala.parquet
Code: 33. DB::Ex---tion: Error while reading parquet data: NotImplemented: Currently only nesting with Lists is supported.
=== Try load data from nulls.snappy.parquet
Code: 8. DB::Ex---tion: Column "b_c_int" is not presented in input data
=== Try load data from repeated_no_annotation.parquet
Code: 33. DB::Ex---tion: Error while reading parquet data: NotImplemented: Currently only nesting with Lists is supported.
=== Try load data from userdata1.parquet
1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02
1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV
1454461771 3 Evelyn Morgan emorgan2@altervista.org Female 7.161.136.94 6767119071901597 Russia 2/1/1960 144972.51 Structural Engineer
1454459781 4 Denise Riley driley3@gmpg.org Female 140.35.109.83 3576031598965625 China 4/8/1997 90263.05 Senior Cost Accountant
1454475931 5 Carlos Burns cburns4@miitbeian.gov.cn 169.113.235.40 5602256255204850 South Africa \N
1454484154 6 Kathryn White kwhite5@google.com Female 195.131.81.179 3583136326049310 Indonesia 2/25/1983 69227.11 Account Executive
1454488388 7 Samuel Holmes sholmes6@foxnews.com Male 232.234.81.197 3582641366974690 Portugal 12/18/1987 14247.62 Senior Financial Analyst
1454482026 8 Harry Howell hhowell7@eepurl.com Male 91.235.51.73 Bosnia and Herzegovina 3/1/1962 186469.43 Web Developer IV
1454471573 9 Jose Foster jfoster8@yelp.com Male 132.31.53.61 South Korea 3/27/1992 231067.84 Software Test Engineer I 1E+02
1454524187 10 Emily Stewart estewart9@opensource.org Female 143.28.251.245 3574254110301671 Nigeria 1/28/1997 27234.28 Health Coach IV
1454458242 11 Susan Perkins sperkinsa@patch.com Female 180.85.0.62 3573823609854134 Russia 210001.95
1454522674 12 Alice Berry aberryb@wikipedia.org Female 246.225.12.189 4917830851454417 China 8/12/1968 22944.53 Quality Engineer
1454525297 13 Justin Berry jberryc@usatoday.com Male 157.7.146.43 6331109912871813274 Zambia 8/15/1975 44165.46 Structural Analysis Engineer
1454536012 14 Kathy Reynolds kreynoldsd@redcross.org Female 81.254.172.13 5537178462965976 Bosnia and Herzegovina 6/27/1970 286592.99 Librarian
1454489603 15 Dorothy Hudson dhudsone@blogger.com Female 8.59.7.0 3542586858224170 Japan 12/20/1989 157099.71 Nurse Practicioner <script>alert(\'hi\')</script>
1454460241 16 Bruce Willis bwillisf@bluehost.com Male 239.182.219.189 3573030625927601 Brazil 239100.65
1454461065 17 Emily Andrews eandrewsg@cornell.edu Female 29.231.180.172 30271790537626 Russia 4/13/1990 116800.65 Food Chemist
1454517864 18 Stephen Wallace swallaceh@netvibes.com Male 152.49.213.62 5433943468526428 Ukraine 1/15/1978 248877.99 Account Representative I
1454499954 19 Clarence Lawson clawsoni@vkontakte.ru Male 107.175.15.152 3544052814080964 Russia 177122.99
1454495436 20 Rebecca Bell rbellj@bandcamp.com Female 172.215.104.127 China 137251.19
1454505444 21 Diane Stevens dstevensk@cnet.com Female 141.243.73.164 Russia 6/5/1985 87978.22 Food Chemist œ∑´®†¥¨ˆøπ“‘
1454523505 22 Lawrence Ramos lramosl@sourceforge.net Male 46.72.4.6 3537473810855655 Tanzania 131283.64
1454525455 23 Gregory Barnes gbarnesm@google.ru Male 220.22.114.145 3538432455620641 Tunisia 1/23/1971 182233.49 Senior Sales Associate 사회과학원 어학연구소
1454472340 24 Michelle Ellis mellisn@timesonline.co.uk Female 239.81.215.135 3547383558025965 Tanzania 6/5/1964 278001.46 Tax Accountant
1454518347 25 Rachel Perkins rperkinso@lulu.com Female 90.173.28.95 633313663891003209 Russia 176178.75
1454486554 26 Anthony Lawrence alawrencep@miitbeian.gov.cn Male 121.211.242.99 564182969714151470 Japan 12/10/1979 170085.81 Electrical Engineer
1454488886 27 Henry Henry hhenryq@godaddy.com Male 191.88.236.116 4905730021217853521 China 9/22/1995 284300.15 Nuclear Power Engineer
1454519352 28 Samuel Hunter shunterr@instagram.com Male 72.190.230.173 5002353797389897 Brazil 9/21/1968 108950.24 Environmental Tech
1454469374 29 Jacqueline Holmes jholmess@ustream.tv Female 47.141.224.95 3555934842115316 United States 247939.52 ̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟
1454535469 30 Annie Torres atorrest@ning.com Female 202.94.67.27 3530389861801215 Nigeria 5/20/1958 118310.72 Electrical Engineer -1E+02
1454526588 31 Antonio Berry aberryu@ow.ly Male 5.82.180.4 Thailand 135007.96
1454533547 32 Nicole Martinez nmartinezv@oakley.com Female 46.32.149.87 United States 149720.75 Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮
1454459459 33 Christina Mason cmasonw@nydailynews.com Female 74.214.22.120 Greece 7/21/1986 242593.85 Senior Sales Associate
1454541103 34 Margaret Barnes mbarnesx@angelfire.com Female 133.178.126.244 3582552005871223 South Africa 11/13/1969 109644.23 Human Resources Assistant II
1454487881 35 Melissa Kelly mkellyy@unblog.fr Female 179.132.207.169 6374648559206801 Indonesia 2/6/1968 45639.62 General Manager Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣
1454484472 36 Betty Carr bcarrz@parallels.com Female 159.201.161.49 France 91370.3 -1E2
1454532399 37 Dorothy Gomez dgomez10@jiathis.com Female 65.111.200.146 493684876859391834 China 57194.86
1454538878 38 Kathryn Lane klane11@netlog.com Female 169.141.178.89 5308993357499254 Czech Republic 8/20/1964 67783.73 Paralegal
1454511326 39 Jose Murphy jmurphy12@paypal.com Male 118.85.253.180 4994715164232848 Chile 8/8/1991 134708.82 Nuclear Power Engineer
1454458506 40 Jack Flores jflores13@yolasite.com Male 162.215.65.11 3577342788590928 Argentina 1/28/1958 81685.1 Financial Advisor
1454529124 41 Walter Martinez wmartinez14@spotify.com Male 165.150.92.96 Somalia 3/8/1972 212105.33 Health Coach I
1454473984 42 Todd Alvarez talvarez15@csmonitor.com Male 59.123.34.76 3557102122317535 Japan 12/19/1999 284728.99 Marketing Assistant
1454488466 43 Amanda Gray agray16@cdbaby.com Female 252.20.193.145 3561501596653859 China 8/28/1967 213410.26 Senior Quality Engineer
1454494415 44 Sharon Simpson ssimpson17@weather.com Female 242.68.147.87 France 9/28/1963 133884.94 Analog Circuit Design manager
1454526201 45 Bonnie Collins bcollins18@list-manage.com Female 132.217.56.27 3540813015762450 Germany 7/21/1986 67661.42 Business Systems Development Analyst
1454474597 46 Deborah Armstrong darmstrong19@addthis.com Female 89.44.11.142 Canada 4/8/1969 111569.22 Quality Control Specialist test
1454486980 47 Daniel Mccoy dmccoy1a@skype.com Male 115.85.247.190 3554507990607374 Central African Republic 66260.14 ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙
1454505529 48 Jean Flores jflores1b@samsung.com Female 211.70.131.207 5392903051983005 Nepal 4/6/1990 199100.32 Financial Advisor
1454521849 49 Lisa Snyder lsnyder1c@woothemes.com Female 145.202.177.215 30475362189761 Germany 12/12/1974 210631.91 Safety Technician II
1454469295 50 Sean Alexander salexander1d@dagondesign.com Male 89.83.147.177 Bosnia and Herzegovina 5/29/1978 256068.38 Senior Financial Analyst
1454481568 51 Ernest Carroll ecarroll1e@dailymail.co.uk Male 194.224.39.215 5100172156945078 Portugal 11/1/1992 100269.36 Dental Hygienist
1454492589 52 Louise Dean ldean1f@tamu.edu Female 109.43.178.48 201996646854139 Ethiopia 173300.37
1454457952 53 Ralph Price rprice1g@tmall.com Male 152.6.235.33 4844227560658222 China 8/26/1986 168208.4 Teacher
1454467269 54 George Ferguson gferguson1h@51.la Male 129.108.219.50 3539784298399554 Macedonia 6/26/1971 153238.6 Computer Systems Analyst IV パーティーへ行かないか
1454515393 55 Anna Montgomery amontgomery1i@google.cn Female 80.111.141.47 3586860392406446 China 9/6/1957 92837.5 Software Test Engineer IV 1E2
1454514049 56 Cheryl Lawrence clawrence1j@ameblo.jp Female 171.155.78.116 Finland 5/7/1985 200827.88 Recruiting Manager
1454459605 57 Willie Palmer wpalmer1k@t-online.de Male 164.107.46.161 4026614769857244 China 8/23/1986 184978.64 Environmental Specialist
1454478957 58 Arthur Berry aberry1l@unc.edu Male 52.42.24.55 3542761473624274 China 144164.88
1454519593 59 Patricia Marshall pmarshall1m@dell.com Female 47.108.196.175 China 7/21/1984 69236.54 Environmental Specialist
1454466852 60 Cynthia Richards crichards1n@dailymail.co.uk Female 178.236.66.213 3557986543874466 Brazil 179378
1454496286 61 David Sanders dsanders1o@fda.gov Male 94.143.190.8 3585745042921822 Mexico 2/15/1963 197445.45 Data Coordiator 0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🔟
1454534081 62 Julia Sullivan jsullivan1p@wisc.edu Female 32.183.154.67 6767624411254094 Bolivia 11/28/1963 118311.39 Electrical Engineer
1454530379 63 Kevin Butler kbutler1q@symantec.com Male 21.88.110.64 3551107057688681 Georgia 12/13/1962 129632.55 Database Administrator III
1454475593 64 Dennis Ross dross1r@parallels.com Male 78.25.77.223 Portugal 5/27/1959 280933.71 Biostatistician II
1454478626 65 Raymond Jacobs rjacobs1s@sohu.com Male 188.52.98.175 5048378563875353 Indonesia 13673.35
1454532460 66 Steven Pierce spierce1t@usgs.gov Male 230.13.54.19 5100178880451481 Namibia 4/10/1965 152382.69 Analyst Programmer
1454480831 67 Jonathan Ellis jellis1u@g.co Male 125.115.227.203 China 4/5/1991 268468.96 Staff Scientist  
1454460516 68 Rachel Price rprice1v@census.gov Female 89.52.192.105 Indonesia 5/6/1982 234502.16 Payment Adjustment Coordinator
1454492257 69 Harold Olson holson1w@chronoengine.com Male 169.173.35.139 China 7/25/1994 146917.43 Occupational Therapist
1454524497 70 Pamela Wagner pwagner1x@gravatar.com Female 184.97.191.144 5593584893781844 Italy 5/3/1964 253108.75 Automation Specialist I 1;DROP TABLE users
1454537805 71 Stephanie Watkins swatkins1y@rakuten.co.jp 124.183.29.113 30552863095190 Burkina Faso 8/29/1971 \N Physical Therapy Assistant
1454530454 72 John Ortiz jortiz1z@mozilla.org Male 4.70.220.127 5194470971764378 Sweden 2/13/1978 91566.02 Analyst Programmer
1454523864 73 Kimberly Wheeler kwheeler20@imgur.com Female 26.46.50.55 China 11/6/1978 31026.94 Junior Executive
1454470404 74 Kathryn Henderson khenderson21@ask.com Female 218.212.63.68 4936394111685353310 Ukraine 4/11/1985 59413.85 Pharmacist -$1.00
1454527390 75 Catherine Gibson cgibson22@ebay.com Female 204.84.35.26 5402007176101895 Indonesia 12/20/1984 92315.94 Desktop Support Technician
1454509078 76 Carolyn Nelson cnelson23@tiny.cc Female 64.13.61.211 4844223687165886 Estonia 3/9/1985 179193.6 Social Worker
1454479055 77 Denise Nguyen dnguyen24@ovh.net Female 18.208.48.116 201900233821394 China 121013.48
1454458493 78 Mildred Torres mtorres25@alibaba.com Female 38.102.60.15 6399156779396437 Russia 9/24/1960 166987.55 Paralegal
1454507970 79 Linda Shaw lshaw26@psu.edu Female 188.221.197.229 3557917782902346 Russia 9/30/1987 67211.67 Structural Analysis Engineer
1454540546 80 Anna Hudson ahudson27@gmpg.org Female 153.84.219.15 Indonesia 9/12/1997 110408.87 VP Marketing
1454536800 81 Albert Pierce apierce28@phoca.cz Male 145.148.40.149 Palestinian Territory 11/4/1955 43019.01 Web Developer III 0/0
1454542995 82 Carol Franklin cfranklin29@marketwatch.com Female 32.189.30.244 67097647572873744 China 6/5/1978 31572.53 Automation Specialist II
1454506472 83 Carlos Washington cwashington2a@phpbb.com Male 90.239.40.124 67063904960748578 United States 11/4/1970 28853.61 Developer I ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙
1454463081 84 Kathryn Austin kaustin2b@livejournal.com Female 152.193.181.90 Philippines 10/8/1990 131855.43 Nurse Practicioner
1454494358 85 Lillian Gardner lgardner2c@hao123.com Female 189.104.46.70 Russia 10/28/1961 145282.64 Occupational Therapist
1454530407 86 Peter Mendoza pmendoza2d@paypal.com Male 77.225.63.206 3562330687037049 Mexico 12/23/1988 40664.88 Staff Scientist
1454466533 87 Dennis Torres dtorres2e@ask.com Male 199.131.129.105 50188330277167912 Croatia 5/25/1986 265985 Account Representative II 社會科學院語學研究所
1454463286 88 Timothy Watkins twatkins2f@toplist.cz Male 120.52.182.111 Tunisia 6/24/2000 242129.05 Operator
1454498394 89 Nicole Willis nwillis2g@cmu.edu Female 44.196.120.110 6394724888228638 Indonesia 2/1/1966 258772.36 Physical Therapy Assistant
1454525151 90 Jacqueline Carr jcarr2h@freewebs.com Female 197.40.38.49 201939989746686 China 5/31/1961 100733.44 Civil Engineer (。◕ ∀ ◕。)
1454510656 91 Theresa Gonzalez tgonzalez2i@nih.gov Female 237.106.229.219 Argentina 8/10/1970 47723.61 Product Engineer
1454479785 92 Donald Bradley dbradley2j@latimes.com Male 244.82.249.86 3534114122488321 Indonesia 7/8/2000 105051.77 Tax Accountant
1454512853 93 Katherine Little klittle2k@cyberchimps.com Female 61.43.154.182 30218284989094 Poland 1/20/1990 155597.16 Associate Professor
1454516486 94 Ruth Cooper rcooper2l@apache.org Female 114.82.62.61 Indonesia 7/20/1993 181481.5 Civil Engineer
1454498785 95 Stephen Gutierrez sgutierrez2m@walmart.com Male 134.231.189.30 3560204445825528 Guatemala 8/22/1995 83986.79 Structural Engineer
1454473160 96 Kevin Scott kscott2n@histats.com Male 226.59.43.229 3558997916332270 United States 6/5/1966 130054.63 Graphic Designer ÅÍÎÏ˝ÓÔÒÚÆ☃
1454540928 97 Steven Williamson swilliamson2o@devhub.com Male 122.216.99.88 France 238119.62
1454473451 98 Shawn Adams sadams2p@imdb.com Male 148.92.123.202 5893564746795315893 Indonesia 11/10/1959 67749.83 Senior Developer test
1454507278 99 Russell Fields rfields2q@google.ca Male 110.74.199.162 Tanzania 1/2/1994 13268.99 Mechanical Systems Engineer
1454514595 100 Willie Weaver wweaver2r@google.de Male 13.54.121.138 3534023246040472 Mexico 8/21/1970 175694.61 Dental Hygienist ̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰
=== Try load data from userdata2.parquet
1454506599 1 Donald Lewis dlewis0@clickbank.net Male 102.22.124.20 Indonesia 7/9/1972 140249.37 Senior Financial Analyst
1454458948 2 Walter Collins wcollins1@bloglovin.com Male 247.28.26.93 3587726269478025 China \N
1454524144 3 Michelle Henderson mhenderson2@geocities.jp Female 193.68.146.150 France 1/15/1964 236219.26 Teacher
1454506939 4 Lori Hudson lhudson3@dion.ne.jp 34.252.168.48 3568840151595649 Russia 4/22/1988 \N Nuclear Power Engineer
1454458529 5 Howard Miller hmiller4@fema.gov Male 103.193.150.230 3583473261055014 France 11/26/1998 50210.02 Senior Editor
1454496547 6 Frances Adams fadams5@123-reg.co.uk Female 106.196.106.93 Russia 3/27/1997 82175.77 Account Coordinator
1454528652 \N Steven Hanson shanson6@cisco.com Male 234.130.172.185 3550842607768119 Indonesia 129582.61
1454487094 8 Louis Simmons lsimmons7@icio.us Male 18.69.80.15 China 6/1/1992 90744.86 Product Engineer
1454543811 9 Keith Parker kparker8@amazonaws.com Male 108.205.40.64 Guadeloupe 12/30/1992 60618.9 Developer II
1454485649 10 Wanda Walker wwalker9@latimes.com Female 246.214.98.78 3539421569669478 Portugal 137664.53
1454517563 11 Kathryn Weaver kweavera@bizjournals.com Female 157.237.161.75 201425019338900 Sweden 117572.65
1454482256 12 Philip Ward pwardb@sakura.ne.jp Male 77.140.225.69 201508031789224 Greece 9/3/1984 238925.79 Human Resources Manager
1454542618 13 Evelyn Harvey eharveyc@time.com 254.174.154.7 3539535868968594 China 5/15/1979 \N Software Engineer III
1454484804 14 Andrea Lane alaned@gov.uk Female 192.253.116.192 5100174455306952 Indonesia 1/19/1989 166778.42 Operator
1454507104 15 Bobby Vasquez bvasqueze@furl.net Male 126.60.18.195 3581051861650673 Philippines 1/25/1975 138184.83 Senior Editor
1454536690 16 Kenneth Gibson kgibsonf@soundcloud.com Male 91.153.142.170 5389947292571488 Peru 11/3/1975 98614.53 Environmental Tech
1454516554 17 Emily Hill ehillg@house.gov Female 109.107.174.205 Palestinian Territory 5/18/1956 218781.48 Executive Secretary
1454541649 18 Kelly Fowler kfowlerh@dell.com Female 147.58.88.116 3551741291105936 Greece 6/11/1975 117249.56 Systems Administrator III
1454524126 19 Diana Howell dhowelli@sphinn.com Female 21.240.75.42 4026635872860296 Iran 7/7/1993 174844.52 Teacher
1454466206 20 Johnny Collins jcollinsj@google.ca Male 38.173.129.250 372301677387203 Afghanistan 7/28/1987 155908.69 Social Worker
1454493912 21 Frank Bradley fbradleyk@shinystat.com Male 186.9.38.46 4913033819988246 Czech Republic 211051.83
1454509391 22 Billy Thomas bthomasl@moonfruit.com Male 143.89.197.162 4026052248187794 Czech Republic 10/7/1991 282061.72 Professor 👾 🙇 💁 🙅 🙆 🙋 🙎 🙍
1454523133 23 Philip Moreno pmorenom@rambler.ru Male 9.39.210.239 4041597502244971 United States 122560.95
1454536839 24 Billy Ray brayn@meetup.com Male 230.255.220.160 201925598515489 Kazakhstan 2/9/1966 130424.35 VP Accounting 사회과학원 어학연구소
1454509252 25 Ryan Wilson rwilsono@forbes.com Male 197.77.142.137 Poland 7/4/1961 280703.91 Software Test Engineer III
1454458024 26 Sandra Coleman scolemanp@blogger.com Female 230.159.39.252 3555708337891155 China 8/7/1971 113688.11 VP Sales
1454513250 27 Evelyn Moreno emorenoq@chronoengine.com Female 126.96.111.52 3557508895347766 United States 8/17/1990 167131.57 Recruiting Manager
1454509036 28 Elizabeth Warren ewarrenr@flavors.me Female 213.8.204.211 67099385430526802 China 6/14/1996 119515.12 Media Manager II
1454541241 29 Linda Hawkins lhawkinss@fotki.com Female 206.6.3.196 4913079795915711 Philippines 2/14/1961 107779.93 Technical Writer
1454493935 30 Janice Day jdayt@devhub.com Female 243.24.120.209 Ukraine 6/9/1972 53906.4 Marketing Manager
1454483872 31 Diane Perez dperezu@ihg.com Female 182.136.218.77 Belarus 2/9/1957 170326.91 Chief Design Engineer
1454529216 32 Bruce Robinson brobinsonv@redcross.org Male 5.126.135.106 201769377515751 Philippines 169520.45
1454470160 33 Daniel Lawrence dlawrencew@usgs.gov Male 200.168.191.214 4911581295367856744 United States 5/7/1967 199535.76 VP Sales
1454474809 34 Theresa James tjamesx@quantcast.com Female 83.122.166.224 3545570545148759 Russia 104683.19
1454536922 35 Scott Russell srusselly@printfriendly.com Male 92.233.3.208 Bolivia 205730.41
1454514354 36 Ruby Vasquez rvasquezz@toplist.cz Female 8.148.83.49 France 11/5/1999 95407.16 Financial Advisor
1454524074 37 Jeffrey Hall jhall10@pagesperso-orange.fr Male 91.103.226.35 3531476231658075 Indonesia 5/29/1987 247716.37 Business Systems Development Analyst
1454477697 38 Debra Kennedy dkennedy11@state.tx.us Female 116.247.236.130 676732277565853203 Mexico 5/22/1955 272563.67 Desktop Support Technician
1454464041 39 Cole Male 157.157.28.86 4911512925983388490 Panama 91174.63
1454521471 40 Helen Sanchez hsanchez13@oakley.com Female 222.122.74.77 Venezuela 2/11/1969 189240.59 Food Chemist
1454527305 41 Jennifer Russell jrussell14@cpanel.net Female 42.82.215.191 Morocco 80644.64 1E02
1454479360 42 Fred Marshall fmarshall15@ifeng.com 160.92.143.233 6374102245574313 China 12/18/1984 \N Structural Engineer
1454464402 43 Terry Ford tford16@shop-pro.jp Male 169.34.131.192 3588107849306045 Turkmenistan 286388.01
1454468866 44 Maria Mason mmason17@miibeian.gov.cn Female 213.62.60.224 060438374765421941 Sweden 7/6/1973 34664.91 Social Worker
1454486568 45 Sharon Schmidt sschmidt18@istockphoto.com Female 111.247.11.124 5100179876769597 Argentina 10/4/1982 150142.49 Mechanical Systems Engineer
1454483332 46 Gregory Jones gjones19@jimdo.com Male 132.88.44.128 30372001476487 China 12/31/1972 240265.01 Design Engineer
1454520829 47 Raymond Moore rmoore1a@arizona.edu 89.39.221.170 5602248693774107 Japan 4/24/1956 \N VP Sales
1454531788 48 Tammy Scott tscott1b@mlb.com Female 236.12.148.59 3577211980737555 Peru 10/14/1959 132064.01 Software Consultant
1454480004 49 Willie Alexander walexander1c@home.pl Male 2.199.150.177 Brazil 10/14/1958 26424.57 Executive Secretary `ィ(´∀`∩
1454473891 50 William Garrett wgarrett1d@java.com Male 20.24.142.67 Croatia 10/9/1963 181424.2 Database Administrator III
1454463118 51 Patricia Peterson ppeterson1e@cpanel.net Female 77.242.54.160 3585161324543005 Peru 3/5/1987 176561.19 Media Manager III
1454488118 52 Andrew Cook acook1f@ftc.gov Male 220.139.174.228 6333320102003586 Bolivia 3/8/1969 185775.61 Computer Systems Analyst III
1454536072 53 Carol Nichols cnichols1g@statcounter.com Female 233.176.31.182 3543580855019963 Nigeria 1/6/1960 105346.38 Compensation Analyst
1454489053 54 Jimmy Morales jmorales1h@archive.org Male 199.160.215.73 3587538933267985 Kiribati 8/25/1961 146625.62 Assistant Media Planner
1454538033 55 Nancy Montgomery nmontgomery1i@freewebs.com Female 11.235.20.56 3586137339728301 China 128631.29 $1.00
1454461902 56 Thomas Freeman tfreeman1j@java.com Male 161.123.216.250 3536920916224146 Colombia 8/4/1973 239571.27 Senior Developer
1454488504 57 Virginia Bell vbell1k@aboutads.info Female 79.142.13.145 3585595583423005 Malaysia 4/2/1998 252007.47 Actuary
1454496671 58 Tammy Adams tadams1l@virginia.edu Female 106.207.61.165 3528072249217643 Canada 1/26/1973 98463.77 Business Systems Development Analyst
1454516066 59 Cynthia Robertson crobertson1m@alibaba.com Female 106.110.239.97 Belarus 12/20/1962 90950.39 Help Desk Technician
1454523801 60 Steven Romero sromero1n@usa.gov Male 65.249.97.254 5007669084530801 Argentina 9/27/1963 14358.32 Quality Control Specialist
1454458452 61 Sean Greene sgreene1o@goo.gl Male 71.195.178.59 5602246313163081 China 2/20/1991 70656.63 Sales Representative
1454537851 62 Jerry Turner jturner1p@scribd.com Male 69.148.19.138 3561778321182616 New Zealand 5/25/1991 89186 Information Systems Manager
1454523562 63 Jennifer Mendoza jmendoza1q@shutterfly.com Female 54.114.8.9 3544098267391200 Russia 7/8/1973 263720.16 General Manager
1454477002 64 Roy Hughes rhughes1r@stanford.edu Male 209.120.70.78 3552886646968253 Canada 10/30/1968 191750.33 Mechanical Systems Engineer
1454477109 65 Susan Jenkins sjenkins1s@princeton.edu Female 247.155.65.12 Philippines 3/1/1967 86339.04 VP Sales
1454527329 66 Norma Dunn ndunn1t@pen.io Female 250.241.78.109 China 7/20/1967 77739.6 Web Designer I
1454461701 67 Tina Reid treid1u@163.com Female 116.38.145.226 Germany 4/25/1967 228301.51 Financial Analyst
1454478121 68 Cynthia Daniels cdaniels1v@pinterest.com Female 17.140.57.238 3589952234971047 Burundi 1/9/1956 42221.96 Research Nurse
1454462100 69 Wells Male 92.13.7.20 Philippines 7/4/1969 78486.77 Tax Accountant
1454516337 70 Stephen Butler sbutler1x@moonfruit.com Male 230.147.124.190 Argentina 125060.01
1454459366 71 Jacqueline Wallace jwallace1y@dagondesign.com Female 203.83.140.84 3578315582149538 Turkmenistan 4/15/1997 89436.49 Cost Accountant
1454479818 72 Carol Dunn cdunn1z@ocn.ne.jp Female 241.2.84.72 5602252003430282308 Bulgaria 2/1/1981 203473.36 Geological Engineer
1454505977 73 Russell Williams rwilliams20@imgur.com Male 21.217.68.126 3566925409646658 Slovenia 1/30/1977 252402.64 Librarian
1454476392 74 Kathryn Torres ktorres21@rakuten.co.jp Female 4.124.222.88 4026779356659103 Portugal 7/31/1956 121285.58 Project Manager
1454463675 75 Larry Mason lmason22@alibaba.com Male 172.104.78.232 3587717468815331 Sweden 4/20/1969 248583.77 Professor
1454517479 76 Rachel Dunn rdunn23@hugedomains.com Female 101.213.94.161 6374938227969686 Peru 6/18/1999 79245.45 Chief Design Engineer
1454457675 77 Doris Elliott delliott24@shinystat.com Female 36.27.140.126 Portugal 9/23/1987 98288.74 Design Engineer
1454483215 78 William Mendoza wmendoza25@prlog.org Male 71.28.136.31 3580069171786970 China 3/20/1967 81965.94 Media Manager II "ثم نفس سقطت وبالتحديد،
1454504790 79 Elizabeth Payne epayne26@about.me Female 40.237.87.45 337941052859146 Estonia 49661.99
1454481311 80 Dennis Robertson drobertson27@w3.org Male 189.45.163.164 Italy 5/2/1972 19984.47 Web Developer III
1454514914 81 Edward Little elittle28@mozilla.org Male 114.189.184.212 South Korea 11/19/1984 141645.22 Senior Sales Associate ../../../../../../../../../../../etc/passwd%00
1454530264 82 Roy Tucker rtucker29@vistaprint.com Male 254.148.189.172 Portugal 285617.13
1454510066 83 Matthew Gardner mgardner2a@wix.com Male 91.23.27.42 5602247355547230028 Brazil 1/18/1977 267617.18 Actuary
1454535958 84 Anthony Palmer apalmer2b@uol.com.br 25.228.124.126 3561410660537354 China 7/4/1974 \N Human Resources Assistant III
1454460668 85 John Hudson jhudson2c@rediff.com Male 75.191.191.171 3538638405093479 Georgia 6/22/1994 82621.71 Tax Accountant
1454479399 86 Jonathan Mills jmills2d@mail.ru Male 224.145.163.163 36504499928546 Philippines 77260.7 00˙Ɩ$-
1454491670 87 Christine Jackson cjackson2e@feedburner.com Female 8.207.125.219 Philippines 6/12/1964 32832.61 Occupational Therapist
1454475253 88 Eric Fernandez efernandez2f@artisteer.com Male 246.217.21.160 France 124825.77
1454483421 89 Heather Diaz hdiaz2g@tmall.com Female 220.248.165.145 502080553226612964 China 7/26/1966 280714.33 Food Chemist
1454515874 90 Nicole Reid nreid2h@cisco.com Female 10.75.131.59 5610704755842409780 Philippines 12/15/1985 24922.19 Marketing Assistant
1454542340 91 Donald Murphy dmurphy2i@fema.gov Male 127.141.234.199 China 4/10/1977 76449.81 Cost Accountant
1454531823 92 Steven Wagner swagner2j@go.com Male 211.154.182.230 United Kingdom 249411.22
1454539859 93 Ruth Alvarez ralvarez2k@sciencedaily.com 240.195.230.204 South Korea 7/11/1964 \N Senior Developer
1454462055 94 Carl Oliver coliver2l@cafepress.com Male 199.184.71.24 China 6/26/1967 215279.38 Operator (╯°□°)╯︵ ┻━┻)
1454457982 95 Teresa Ruiz truiz2m@diigo.com Female 22.118.240.24 337941028849437 Brazil 7/25/1994 243603.67 Cost Accountant
1454465475 96 Kathryn Carter kcarter2n@fastcompany.com Female 203.255.226.40 Greece 1/23/1969 34951.57 Registered Nurse
1454542755 97 Fred Perry fperry2o@imgur.com 46.52.134.142 3544236333368634 Indonesia 2/6/1966 \N Programmer Analyst III
1454477885 98 Harry Perkins hperkins2p@domainmarket.com Male 235.202.132.85 374288817366643 Russia 1/9/1962 167340.53 Physical Therapy Assistant
1454509699 99 Bobby Hicks bhicks2q@wix.com Male 253.252.57.121 3555445397654443 United States 8/10/1964 238304.33 Quality Control Specialist Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮
1454515572 100 Tammy Dunn tdunn2r@list-manage.com Female 162.156.75.67 Brazil 4/24/1980 163106.38 Sales Representative
=== Try load data from userdata3.parquet
1454515666 1 Ernest Fuller efuller0@examiner.com Male 106.72.28.74 5610608195667267 Israel 140639.36
1454536327 2 Anthony Foster afoster1@weibo.com Male 156.243.130.166 4508242795214771 Indonesia 1/16/1998 172843.61 Developer II 👾 🙇 💁 🙅 🙆 🙋 🙎 🙍
1454466139 3 Ryan Montgomery rmontgomery2@mozilla.org Male 28.55.168.128 Colombia 11/21/1978 204620.66 Developer I ␢
1454473204 4 Brenda Nelson bnelson3@photobucket.com Female 185.81.160.85 Guatemala 10/29/1998 260474.12 GIS Technical Architect
1454458516 5 Jacqueline Ellis jellis4@amazon.com Female 158.137.238.6 Russia 7/12/1959 286038.78 Marketing Assistant
1454528894 6 Paul Ferguson pferguson5@gmpg.org Male 141.122.136.144 30501574577558 Thailand 241518.24
1454489945 7 Linda Hunt lhunt6@prlog.org Female 104.179.97.82 Russia 3/30/1988 192756.38 Professor
1454486691 8 Frances Kim fkim7@blog.com Female 28.77.158.48 676306013856639159 Indonesia 188511.28 <svg><script>0<1>alert(\'XSS\')</script>
1454487153 9 Jason Matthews jmatthews8@google.co.uk Male 72.129.239.24 3534550235909507 China 7/29/1982 238068.56 Web Designer III
1454519282 10 Carolyn Elliott celliott9@cpanel.net Female 51.211.70.30 3563436733386899 Indonesia 4/28/1977 132718.26 Research Nurse
1454473379 11 Thomas Mills tmillsa@psu.edu Male 104.114.227.199 5018278895598921190 Russia 236386.69
1454534367 12 Russell Lee rleeb@howstuffworks.com Male 193.165.137.217 China 280252.36 🐵 🙈 🙉 🙊
1454525264 13 Chris Bailey cbaileyc@redcross.org Male 246.109.118.154 30485245023962 Thailand 11/26/1970 200218.34 Research Assistant I
1454457712 14 Eric Parker eparkerd@usa.gov Male 25.73.91.135 5602249431899032 Russia 8/12/1986 102832.54 Tax Accountant
1454526788 15 Anne Robertson arobertsone@geocities.jp Female 209.77.27.30 Armenia 168201.04  
1454494278 16 Angela Gonzalez agonzalezf@state.gov Female 118.77.43.191 Sweden 7/1/1972 161220.37 Database Administrator I
1454488522 17 Edward Moreno emorenog@hp.com Male 200.50.125.67 3559979696602303 France 8/17/1966 144551.41 Chief Design Engineer
1454496145 18 Roy Murray rmurrayh@sphinn.com Male 91.52.226.221 3546330084792460 Portugal 285872.87 𠜎𠜱𠝹𠱓𠱸𠲖𠳏
1454492939 19 Louis Willis lwillisi@hp.com 14.132.82.250 Philippines 8/1/1980 \N Director of Sales
1454530172 20 Edward Perez eperezj@china.com.cn Male 24.152.201.59 3571014044514515 Indonesia 29515.23
1454518522 21 Nicole Price npricek@cpanel.net Female 4.21.204.142 Peru 5/8/1978 154023.3 Office Assistant III
1454496552 22 Virginia Nichols vnicholsl@ning.com Female 160.202.18.170 30166467912021 Greece 5/10/1966 145509.34 Programmer II
1454474290 23 Katherine Roberts krobertsm@hostgator.com Female 247.21.118.188 Cuba 192723.43
1454522256 24 Emily Sullivan esullivann@sakura.ne.jp Female 33.152.103.14 4074771539744796 Indonesia 6/28/1965 36127.55 VP Sales
1454527958 25 Susan Turner sturnero@google.pl 150.94.47.96 374283138983226 United States \N
1454540961 26 Fred Jenkins fjenkinsp@walmart.com Male 219.195.7.86 China 3/23/1965 69388.75 Human Resources Assistant I
1454496916 27 Jane Torres jtorresq@photobucket.com Female 147.220.219.158 5002353015111222 Indonesia 9/29/1997 226788.25 Occupational Therapist
1454508711 28 Louis Patterson lpattersonr@wp.com Male 158.176.255.43 5100145505218793 China 9/20/1993 30309.45 VP Quality Control
1454538643 29 Brandon Wagner bwagners@slashdot.org Male 124.203.101.37 6771208405057819279 Iraq 10/3/1959 95522.88 Research Associate
1454484725 30 Amy Jenkins ajenkinst@wikia.com Female 21.0.126.111 3542005201579396 Ethiopia 9/26/1984 167682.84 Tax Accountant """\'""\'""\'\'\'"""
1454513613 31 Timothy Frazier tfrazieru@toplist.cz 100.218.94.178 China 5/17/1963 \N Director of Sales 0.00
1454463548 32 Phillip Meyer pmeyerv@live.com Male 184.208.76.39 3541248561759148 France 11/3/1974 245572.41 Nurse
1454528692 33 Joe Wallace jwallacew@mail.ru Male 167.122.66.246 5602246900361320 Russia 64311.11
1454466352 34 Walter Rivera wriverax@de.vu Male 67.169.221.120 5366484318587717 Russia 1/28/1983 271690.8 Programmer Analyst I
1454480715 35 Lois Mcdonald lmcdonaldy@paypal.com 44.140.199.251 Portugal \N
1454499439 36 William Edwards wedwardsz@acquirethisname.com Male 69.187.29.7 3528411636358679 Egypt 2/23/1958 252476.42 Financial Analyst Œ„´‰ˇÁ¨ˆØ∏”’
1454460587 37 Frank Stevens fstevens10@samsung.com Male 61.182.84.178 Philippines 3/19/1958 47326.14 VP Product Management
1454536874 38 Albert Martinez amartinez11@godaddy.com Male 76.139.124.119 Ukraine 11/11/1994 57220.55 Software Engineer III
1454504601 39 Stephanie Stewart sstewart12@elpais.com Female 104.98.138.203 4905603900430425379 Syria 2/11/1975 250118.59 Developer I
1454521301 40 Annie Stevens astevens13@slate.com Female 214.146.163.79 3553338148582934 South Africa 11/8/1983 12963.52 Systems Administrator I -1E2
1454460788 41 Joyce Butler jbutler14@csmonitor.com Female 88.243.175.236 Indonesia 135825.27
1454460615 42 Carlos Armstrong carmstrong15@technorati.com Male 85.22.216.153 3532000356234436 Indonesia 23446.58
1454537073 43 Frances Kelly fkelly16@springer.com Female 146.38.150.164 4026344347458956 China 242916.36
1454507861 44 Amanda Pierce apierce17@phpbb.com Female 214.208.248.216 201678379872880 Faroe Islands 6/1/1990 38037.1 Software Test Engineer II test
1454464352 45 Alan Torres atorres18@histats.com Male 117.124.224.32 4844818559255911 Israel 114759.77
1454528513 46 Nancy Brown nbrown19@lycos.com Female 98.103.84.222 4041378619584967 Portugal 9/16/1972 170596.79 GIS Technical Architect
1454518979 47 Kenneth Larson klarson1a@cnet.com Male 71.35.49.21 Philippines 2/3/1990 178010.01 Staff Scientist
1454536052 48 Thomas Lawson tlawson1b@canalblog.com Male 209.50.87.12 50201361710870252 Ukraine 10/5/1987 35118.14 Software Test Engineer II
1454488725 49 Debra Gomez dgomez1c@lycos.com Female 26.107.134.220 30508009555281 China 9/10/1979 129186.15 Electrical Engineer
1454489047 50 Deborah Price dprice1d@google.nl Female 207.145.225.232 4055636387933119 Russia 1/26/1983 165945.4 Dental Hygienist ␡
1454478467 51 Diane Banks dbanks1e@wikispaces.com Female 22.253.228.131 China 39139.44
1454468949 52 Marie Woods mwoods1f@bbc.co.uk 41.109.183.128 Russia 2/20/1989 \N Human Resources Manager
1454489570 53 Randy Romero rromero1g@tamu.edu Male 134.90.91.230 Indonesia 11/30/1960 230039.26 Professor
1454528266 54 Brandon Fox bfox1h@ocn.ne.jp Male 157.130.211.215 6391404048298002 China 2/1/1979 223567.43 Programmer III
1454513948 55 Albert Smith asmith1i@jalbum.net Male 167.84.86.133 3530479136988416 Ukraine 263457.42
1454467976 56 Jeremy Black jblack1j@sphinn.com Male 181.85.144.139 Poland 194896.66
1454463146 57 Marilyn Shaw mshaw1k@bloomberg.com Female 141.42.43.91 30110642387063 China 178473.04
1454540383 58 Stephanie Diaz sdiaz1l@who.int Female 127.174.128.199 3571927033182087 Indonesia 3/25/1974 135570.75 Paralegal
1454492347 59 Christopher Reynolds creynolds1m@sun.com Male 81.89.26.14 China 5/29/1956 147519.69 Account Executive
1454529565 60 Douglas Holmes dholmes1n@weather.com Male 99.22.29.208 Honduras 11/29/2000 45372.51 VP Accounting œ∑´®†¥¨ˆøπ“‘
1454485707 61 Howard Rogers hrogers1o@sciencedirect.com Male 222.229.220.65 Ukraine 2/26/1995 143231.21 Account Executive
1454489894 62 Melissa Washington mwashington1p@cmu.edu Female 32.151.71.144 374288910553246 Czech Republic 2/24/1966 266547.15 Human Resources Manager
1454541195 63 Margaret Flores mflores1q@usnews.com Female 108.42.248.249 France 8/25/1999 110594.3 Data Coordiator
1454458233 64 Rose Fernandez rfernandez1r@usgs.gov Female 199.141.221.229 3564435193511524 Brazil 5/5/1972 196329.18 Senior Cost Accountant
1454472500 65 Julie Mendoza jmendoza1s@unesco.org Female 137.192.7.121 3586331607810566 Cuba 149157.14
1454515883 66 Earl Sanders esanders1t@github.com Male 179.122.203.141 3561742181897127 Vietnam 215545.14 𠜎𠜱𠝹𠱓𠱸𠲖𠳏
1454460569 67 Eric Armstrong earmstrong1u@arizona.edu Male 128.202.252.112 4041590574307 Indonesia 5/30/1973 75347.18 Web Designer II
1454532395 68 Joyce Perez jperez1v@dmoz.org Female 145.86.183.96 Canada 3/29/1975 115579.36 Director of Sales
1454524697 69 Sanchez Female 100.163.22.106 Russia 127045.66
1454489862 70 Laura Romero lromero1x@godaddy.com Female 237.131.116.77 3539134691869631 Madagascar 12/20/1957 208213.96 Business Systems Development Analyst
1454538359 71 Maria Thomas mthomas1y@lycos.com Female 12.113.23.220 5602229580950679 China 10/29/1990 88961.11 Nurse
1454520121 72 Victor Romero vromero1z@reference.com Male 208.79.116.61 6767842086446946518 Brazil 209207.14
1454510241 73 Betty Hayes bhayes20@goo.ne.jp Female 153.254.225.4 201881044698306 Jordan 3/9/1970 173372.32 VP Accounting
1454465142 74 Roger Jacobs rjacobs21@rediff.com Male 51.122.147.153 36548589951538 Benin 7/18/1977 18545.32 Paralegal 1/2
1454470850 75 Ruth Thompson rthompson22@reuters.com Female 220.41.116.217 67067442144878124 Croatia 6/30/1972 167279 Account Executive ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ
1454515259 76 Theresa James tjames23@un.org Female 31.135.76.146 China 12/28/1974 188732.88 Financial Advisor
1454517695 77 Pamela Collins pcollins24@nih.gov Female 21.45.74.249 490591529416018576 Moldova 7/28/1998 252394.72 Marketing Assistant 🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧
1454523543 78 Adam Ward award25@telegraph.co.uk Male 242.85.131.30 201794641891036 Brazil 276446.24
1454458334 79 Robin Price rprice26@jugem.jp Female 235.141.108.176 5610389618618837 Russia 1/7/1977 120293.75 Biostatistician IV
1454529469 80 Barbara Ryan bryan27@usda.gov Female 58.0.103.48 30526192141883 Philippines 198959.68
1454497076 81 Melissa Gibson mgibson28@census.gov Female 54.212.104.159 3529828486403520 Bhutan 7/29/1990 224163.74 Senior Developer
1454467979 82 Carolyn Morris cmorris29@cbslocal.com Female 86.106.24.230 Portugal 2/12/1958 87727.95 Quality Engineer 0.00
1454484623 83 Stephen Harris sharris2a@un.org Male 247.19.48.100 Russia 4/9/1983 284559.55 Product Engineer ١٢٣
1454476730 84 Linda Campbell lcampbell2b@mapy.cz Female 28.62.77.24 6759510168753943 Peru 2/27/1982 16435.84 VP Quality Control ␡
1454463822 85 Brian Daniels bdaniels2c@ovh.net Male 143.36.66.196 Ecuador 7/6/1966 148952.4 Information Systems Manager
1454458337 86 West Female 247.72.186.254 3541609903446548 Indonesia 12/11/1984 132544.98 Physical Therapy Assistant
1454518267 87 Timothy Moore tmoore2e@printfriendly.com Male 109.229.170.253 Samoa 42697.58
1454523368 88 Eric Walker ewalker2f@mozilla.com Male 243.173.35.155 Thailand 5/29/1970 48715.81 Engineer IV
1454486082 89 Maria Arnold marnold2g@google.com.br Female 58.58.77.228 3589928770150089 Uruguay 3/14/1956 64067 Geological Engineer
1454541738 90 Edward Garza egarza2h@moonfruit.com Male 43.21.138.236 New Zealand 3/27/1965 139025.58 Structural Analysis Engineer
1454490484 91 Alice Young ayoung2i@typepad.com Female 120.255.189.145 630468343049978318 Serbia 4/18/1981 17663.49 Automation Specialist I
1454512586 92 Kenneth Powell kpowell2j@unicef.org Male 238.251.71.34 3586683330377036 Philippines 2/10/1955 68010.82 Social Worker
1454472784 93 Kelly Bell kbell2k@hud.gov Female 176.210.241.20 Russia 11/17/1984 57640.41 Web Developer I  
1454490007 94 David Garcia dgarcia2l@tmall.com Male 100.18.61.166 Paraguay 201297.71
1454504627 95 Maria Harvey mharvey2m@nydailynews.com Female 192.209.117.213 67593619471737741 Mongolia 283649.67
1454505519 96 Chris Hall chall2n@imageshack.us Male 241.96.162.44 5594268668744901 Russia 1/3/1964 67656.08 Web Designer II
1454481847 97 Roger Simpson rsimpson2o@nymag.com Male 80.110.89.28 493618903455317947 Indonesia 76354.79
1454515032 98 Richard Nelson rnelson2p@simplemachines.org Male 43.54.4.82 Brazil 237205.58 NIL
1454461907 99 Ruth Howell rhowell2q@cornell.edu Female 190.170.191.14 China 5/2/1969 286113.38 Senior Quality Engineer
1454524115 100 Judith Garza jgarza2r@usnews.com Female 204.216.154.40 Ecuador 6/22/1962 256786.42 Teacher
=== Try load data from userdata4.parquet
1454599685 1 Howard Morgan hmorgan0@typepad.com 158.178.195.62 Colombia 12/2/1992 \N Data Coordiator
1454581720 2 Jessica Schmidt jschmidt1@google.com Female 168.118.247.35 3565285464047941 Luxembourg 4/14/1995 222396.46 Research Nurse nil
1454608896 3 Beverly Flores bflores2@wikipedia.org Female 51.97.88.173 Sweden 2/15/1965 141112.8 Actuary
1454575874 4 Marilyn Sanchez msanchez3@intel.com Female 186.206.142.162 China 8/6/1969 87914.29 Structural Engineer
1454567588 5 Janice Mitchell jmitchell4@sina.com.cn Female 205.187.116.241 5610719759939376962 Poland 7/4/1995 269297.4 Systems Administrator I
1454545227 6 William Williamson wwilliamson5@trellian.com Male 44.86.73.201 201849487683564 Indonesia 12/6/1993 95352.25 Librarian 1E+02
1454602212 7 Jack James jjames6@sogou.com Male 59.184.76.208 3552911855395632 Indonesia 11/25/1968 82549.73 Compensation Analyst test
1454556325 8 Jesse Arnold jarnold7@soup.io Male 7.25.90.13 5100177285965756 Brazil 10/19/1987 257968.86 Executive Secretary
1454622627 9 Lori Woods lwoods8@fastcompany.com Female 147.157.215.9 4844532485570190 Indonesia 12/26/1975 186145.91 Health Coach I
1454601455 10 Juan Evans jevans9@zimbio.com Male 150.132.218.181 3578802610769023 Philippines 5/29/1988 129369.52 Social Worker
1454579490 11 Roy Matthews rmatthewsa@ucsd.edu Male 203.239.85.224 5100135134598509 Russia 192057.84
1454586145 12 Kenneth King kkingb@zimbio.com 9.103.96.206 675913564329481832 Greece \N
1454568600 13 Raymond Green rgreenc@fc2.com Male 163.9.101.43 United States 1/28/1984 225094.01 Budget/Accounting Analyst III
1454603300 14 Lillian Stephens lstephensd@psu.edu Female 31.50.183.23 630455284969060148 Finland 6/1/1973 19354.85 Information Systems Manager
1454560697 15 Mary Gonzales mgonzalese@wired.com Female 91.42.17.109 3560985473023370 France 5/7/1966 23746.36 Compensation Analyst
1454561895 16 Roger Mason rmasonf@newyorker.com Male 169.33.172.204 3545036194973129 Norway 165855.47
1454604198 17 Diane Cole dcoleg@unesco.org Female 157.11.85.209 Philippines 6/9/1994 105028.67 Assistant Manager
1454601270 18 Annie Hunt ahunth@ocn.ne.jp Female 169.47.232.187 5100177440436305 Poland 3/30/1992 266071.6 Legal Assistant
1454600872 19 Jacqueline Bradley jbradleyi@epa.gov Female 83.241.214.77 5100131814165289 Indonesia 12/1/1971 55440.88 Dental Hygienist
1454600248 20 Kathy Russell krussellj@joomla.org Female 158.32.89.44 3585627581021729 Indonesia 11/20/1999 29602.23 Sales Representative
1454551378 21 Beverly Barnes bbarnesk@europa.eu Female 189.157.45.179 3548552521258155 Bulgaria 4/21/1956 37295.89 Human Resources Assistant II
1454604764 22 Roy Morris rmorrisl@scribd.com 201.51.139.86 China \N
1454569146 23 Alice Ramos aramosm@utexas.edu Female 185.168.142.9 374622349140748 Philippines 4/20/1966 138021.54 Paralegal
1454597325 24 Todd Kelly tkellyn@fotki.com Male 46.19.203.86 4041599550654 Portugal 3/14/1998 84343.96 Executive Secretary () { _; } >_[$($())] { touch /tmp/blns.shellshock2.fail; }
1454551797 25 Lawrence Ramos lramoso@imageshack.us Male 5.96.81.47 5010121401502407 Palestinian Territory 1/26/1994 265545.92 Operator
1454605654 26 Jennifer Rogers jrogersp@so-net.ne.jp Female 31.48.54.193 5610097864736794573 Yemen 6/5/1992 138365.1 Computer Systems Analyst II
1454603775 27 Kimberly Morgan kmorganq@seesaa.net Female 154.61.255.47 China 14486.75 0/0
1454606635 28 Jessica Marshall jmarshallr@mtv.com Female 164.101.35.148 3531025977662047 Brazil 7/2/1987 216211.96 VP Accounting
1454597817 29 Katherine Gordon kgordons@phoca.cz Female 248.30.182.15 5602230546469168 Italy 10/11/1956 48478.51 Librarian
1454557995 30 Jennifer Phillips jphillipst@pcworld.com Female 61.30.215.16 5100179891124018 Sweden 9/3/1967 254808.27 Software Consultant
1454613512 31 Gerald Nguyen gnguyenu@seesaa.net Male 9.13.167.17 67717376159922001 China 9/3/1972 285571.49 Tax Accountant
1454625134 32 Rose Ellis rellisv@walmart.com Female 250.88.7.15 3580333318847248 China 4/23/1987 47695.25 Systems Administrator II 和製漢語
1454622672 33 Margaret Grant mgrantw@bbb.org Female 227.165.116.192 3565645038486711 Slovenia 12/10/1992 106452.61 Account Coordinator
1454568796 34 Jessica Wells jwellsx@blogtalkradio.com Female 185.189.187.186 Azerbaijan 9/13/1996 173164.24 Project Manager
1454582324 35 Henry Jenkins hjenkinsy@mit.edu Male 10.83.90.235 5602221853972654 China 11/12/1975 25740.85 Recruiter 田中さんにあげて下さい
1454545876 36 Earl Mccoy emccoyz@bigcartel.com Male 161.179.122.154 5038877150819047588 Japan 10/12/1976 114766.43 Software Test Engineer IV 0.00
1454618571 37 Paul Knight pknight10@google.cn Male 182.38.37.173 5020715558032859593 Ukraine 10/25/1971 199366 Social Worker
1454576590 38 Martha Clark mclark11@usda.gov 189.166.203.239 South Korea \N
1454601033 39 Clarence Bryant cbryant12@bigcartel.com Male 120.218.175.241 Poland 9/1/1968 257075.65 Professor 田中さんにあげて下さい
1454548319 40 Joan Price jprice13@mtv.com 233.4.158.135 3584182571037112 Portugal \N
1454573152 41 Anthony Ford aford14@chicagotribune.com Male 100.240.61.163 Iran 6/26/1992 152800.71 Senior Cost Accountant
1454595667 42 Roger Henderson rhenderson15@sitemeter.com Male 206.185.213.252 3560757094744860 Brazil 6/26/1970 40949.78 Nurse
1454591751 43 Kenneth Butler kbutler16@youtu.be Male 2.12.57.207 3586795027670612 Thailand 3/26/1987 165121.43 Research Assistant IV
1454566774 44 Kenneth Wright kwright17@google.de Male 241.213.136.95 5602246924892961 Belarus 10/15/1995 227583.86 Speech Pathologist
1454617513 45 Aaron Smith asmith18@flickr.com Male 185.244.9.145 China 11/25/1972 286108.94 Paralegal
1454574169 46 Amy Matthews amatthews19@t.co Female 206.172.83.152 5002357749310919 China 39365.73
1454586102 47 Janet Cooper jcooper1a@dailymotion.com Female 9.148.129.197 Comoros 8/2/1968 168391.72 Senior Cost Accountant
1454601994 48 Russell Stewart rstewart1b@edublogs.org Male 113.23.229.63 675993663890158630 Thailand 4/17/1963 57609.96 Senior Editor
1454582839 49 Howard Elliott helliott1c@illinois.edu Male 225.208.151.89 3577055641640512 Mongolia 176999.03
1454573932 50 Keith Lane klane1d@eventbrite.com Male 250.24.9.55 Russia 5/27/1983 80452.19 Budget/Accounting Analyst II
1454583292 51 Jimmy Richardson jrichardson1e@vimeo.com Male 152.87.188.99 China 6/30/1960 194774.28 Assistant Manager ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙
1454623280 52 Justin Bryant jbryant1f@github.com Male 245.48.63.169 3562259518717901 Guatemala 10/28/1960 144419.21 Database Administrator III
1454582337 53 Ruby Allen rallen1g@cyberchimps.com Female 238.148.148.156 3541217939068433 Japan 248388.64
1454578101 54 Ward Male 120.88.247.59 Russia 125075.78
1454546163 55 Nancy Stephens nstephens1i@godaddy.com Female 211.0.225.116 Mongolia 20805.69
1454580277 56 Dorothy Kennedy dkennedy1j@mlb.com Female 177.229.94.96 Indonesia 3/26/1984 118098.45 Legal Assistant
1454597567 57 Katherine Ferguson kferguson1k@google.cn Female 185.67.150.20 5038883804496681778 Russia 1/28/1982 255040.89 Chemical Engineer
1454609494 58 Norma Daniels ndaniels1l@adobe.com Female 72.161.56.76 5602256058813840 Lithuania 5/30/1986 228396.52 Junior Executive
1454549169 59 John Rogers jrogers1m@miitbeian.gov.cn Male 91.131.170.178 3578552255653202 Croatia 9/25/1971 164207.53 Administrative Assistant III
1454627177 60 Lisa Nguyen lnguyen1n@phpbb.com Female 99.51.36.31 3587343436670904 Ghana 6/10/1970 213963.71 Research Nurse
1454564279 61 Roy Carter rcarter1o@cmu.edu Male 154.176.171.103 3581163353975466 Germany 7/21/1980 216294.79 Marketing Manager
1454546835 62 Donna Gonzalez dgonzalez1p@instagram.com Female 81.57.136.186 China 3/3/1975 181562.45 Junior Executive
1454610240 63 Medina Female 84.135.250.216 3579667388606106 Indonesia 7/18/1958 80267.81 Accounting Assistant III
1454613635 64 Samuel Bishop sbishop1r@npr.org Male 87.38.89.122 3534693555244475 Indonesia 97009.57
1454551032 65 Jerry Bradley jbradley1s@umn.edu Male 184.79.105.210 5602258009829107 China 3/13/1984 50863.85 Junior Executive
1454555641 66 Ralph Castillo rcastillo1t@nba.com Male 96.246.167.130 6373313274491359 United States 5/14/1986 13099.91 Health Coach III
1454615262 67 Margaret Vasquez mvasquez1u@tuttocitta.it Female 206.79.16.146 Poland 2/19/1973 281677.49 Quality Engineer
1454564143 68 Shawn Payne spayne1v@privacy.gov.au Male 233.32.138.222 6380689013620353 China 5/29/1996 152175.99 Help Desk Operator
1454560234 69 Bonnie Hart bhart1w@networkadvertising.org Female 92.158.145.51 5100141023990187 Philippines 8/10/1976 270525.27 Clinical Specialist
1454557523 70 Ruby Phillips rphillips1x@google.com.hk Female 180.71.236.34 Russia 12/29/1980 175991.04 Analog Circuit Design manager
1454615738 71 Michael Watkins mwatkins1y@infoseek.co.jp Male 20.48.165.57 6304600968704640 United States 277599.55
1454549243 72 Walter Hill whill1z@fda.gov Male 169.189.26.193 Philippines 4/25/1989 170789.26 Executive Secretary
1454590835 73 Deborah Garcia dgarcia20@ehow.com Female 176.149.163.227 3578754434491831 Brazil 213787.81 !@#$%^&*()
1454592567 74 Sandra Lee slee21@hatena.ne.jp Female 196.212.29.124 China 12/25/1976 190399.56 Assistant Media Planner ../../../../../../../../../../../etc/passwd%00
1454570808 75 Steve Shaw sshaw22@photobucket.com Male 56.32.41.109 3561652394394350 Macedonia 3/2/1961 180130.01 Recruiting Manager
1454627208 76 Jerry Hansen jhansen23@newyorker.com Male 180.99.147.201 36652106508977 Ukraine 4/27/1992 201900.61 Chief Design Engineer
1454595596 77 Joshua Harris jharris24@china.com.cn Male 93.173.2.87 3566428334927244 Greece 8/27/1987 189392.3 Account Representative III
1454615457 78 Clarence Simmons csimmons25@dailymotion.com Male 30.117.30.162 3571762129017388 Philippines 180434.25
1454604481 79 Denise Bishop dbishop26@wsj.com Female 251.230.214.155 3556286320706184 Philippines 10/18/1999 194426.62 Geologist II
1454614660 80 Jason Warren jwarren27@shop-pro.jp Male 197.52.56.75 4913424719275497 China 8/26/1998 92571.41 Accounting Assistant II
1454592347 81 Jesse Reynolds jreynolds28@amazon.com 46.11.66.226 Portugal 10/6/1977 \N Administrative Officer <img src=x onerror=alert(\'hi\') />
1454579746 82 Ruby Lynch rlynch29@xing.com Female 50.190.120.2 340177638737200 Portugal 5/7/1981 159634.3 Sales Associate
1454578991 83 Phillip Olson polson2a@marriott.com Male 38.205.137.200 4905640692662084 Indonesia 1/8/1987 161622.19 Assistant Media Planner
1454574785 84 Sean Watkins swatkins2b@ft.com Male 22.52.43.242 6759770945991352 China 2/7/1964 103943.54 Senior Financial Analyst
1454603364 85 Teresa Parker tparker2c@shinystat.com Female 36.134.254.22 4844522554899455 China 11/24/1987 137739.95 Chief Design Engineer
1454629483 86 Anthony Harris aharris2d@uiuc.edu Male 142.3.139.220 China 2/26/1975 194926.38 Senior Quality Engineer
1454617821 87 Donna Ray dray2e@wikimedia.org Female 122.113.90.100 3548062974262878 Peru 7/24/1964 121072.45 Clinical Specialist åß∂ƒ©˙∆˚¬…æ
1454567199 88 Craig Lewis clewis2f@purevolume.com Male 106.156.113.218 3535698276698452 Slovenia 113013.98
1454606687 89 Adam Turner aturner2g@delicious.com Male 94.92.15.85 3530109929436477 Sweden 3/18/1976 233715.21 Nurse Practicioner
1454565501 90 Terry Parker tparker2h@hc360.com Male 189.36.77.133 China 4/2/1987 232623.76 GIS Technical Architect
1454604198 91 Juan Shaw jshaw2i@ehow.com Male 222.127.83.190 493610712595084582 Democratic Republic of the Congo 220779.8
1454592729 92 Nicole Russell nrussell2j@angelfire.com Female 247.123.224.36 4120730296866808 Germany 90748.17
1454563310 93 Robin Ray rray2k@t.co Female 217.150.228.185 Sweden 9/28/1968 175995.93 Human Resources Assistant III """"
1454546406 94 Debra Sims dsims2l@meetup.com Female 150.198.93.159 5602215295621929 Brazil 12/21/1984 276704.96 Office Assistant IV
1454550946 95 Teresa Harrison tharrison2m@t.co Female 111.107.40.16 5007666196554596 Philippines 5/12/1959 129967.9 GIS Technical Architect
1454603302 96 Tammy Ward tward2n@51.la Female 148.119.68.255 3568303818489466 France 8/20/1984 63550.31 General Manager
1454605950 97 Louis Harrison lharrison2o@usgs.gov Male 134.95.151.68 5100179516595931 Ukraine 9/27/1986 169379.73 Payment Adjustment Coordinator
1454579744 98 Charles Simpson csimpson2p@mashable.com Male 241.0.124.209 3562073915241617 Sweden 9/20/1956 116909.68 Biostatistician IV
1454584629 99 Maria Richards mrichards2q@rediff.com Female 108.13.82.54 Azerbaijan 1/23/1978 34000.68 Clinical Specialist 社會科學院語學研究所
1454622328 100 Diana Hall dhall2r@oaic.gov.au Female 6.215.107.104 3528227609255704 Russia 8/29/1996 221168.13 Assistant Professor
=== Try load data from userdata5.parquet
1454582047 1 Kelly Ortiz kortiz0@omniture.com Female 252.115.158.159 3537905681760845 Russia 4/23/1980 277302.99 Nurse
1454626441 2 Sharon Carroll scarroll1@disqus.com Female 29.217.252.62 56022458507191696 Indonesia 8/28/1992 209258.05 Recruiter åß∂ƒ©˙∆˚¬…æ
1454608790 3 Ruth Ross rross2@cbc.ca Female 220.224.80.32 3589642396435648 Benin 6/13/1994 18270.7 Design Engineer
1454601797 4 Kelly Meyer kmeyer3@cornell.edu Female 255.65.123.124 Philippines 1/6/1967 17485.27 Cost Accountant
1454584344 5 Irene Jordan ijordan4@pagesperso-orange.fr Female 162.57.23.136 3576848317807089 United States 1/4/1997 163979.38 Programmer Analyst III
1454547199 6 Irene Wells iwells5@fema.gov Female 85.5.67.113 Iran 74337.42
1454604109 7 Jessica Grant jgrant6@gov.uk Female 127.235.63.12 3536345996536989 Ecuador 1/27/1969 128665.86 Payment Adjustment Coordinator
1454549472 8 Norma Wright nwright7@prweb.com Female 81.219.156.187 63047796765720509 Indonesia 6/27/1997 68907.46 Office Assistant III
1454611735 9 Brandon Snyder bsnyder8@artisteer.com Male 102.118.191.191 490339322609872711 Malta 10/6/1981 71646.15 Physical Therapy Assistant
1454610256 10 Stephanie Reed sreed9@who.int Female 175.52.228.75 502081312903167845 Afghanistan 8/27/1957 137924.13 Recruiter test
1454565105 11 Jane Armstrong jarmstronga@state.gov 202.44.98.126 374283443294665 China 10/30/1991 \N Associate Professor
1454607247 12 Donna Coleman dcolemanb@upenn.edu Female 178.9.167.99 Vietnam 11/21/1957 93283.06 Librarian
1454567839 13 Samuel Butler sbutlerc@hp.com Male 129.114.220.80 3587725229492688 Colombia 9/12/1984 208303.6 Compensation Analyst
1454567413 14 Jerry Medina jmedinad@youtu.be Male 87.0.152.222 3579766249568578 Japan 8/30/1988 53502.26 Registered Nurse
1454603317 15 Samuel Lane slanee@i2i.jp Male 225.20.25.160 Canada 9/6/1983 142643.38 GIS Technical Architect ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙
1454630090 16 Kathy Rice kricef@independent.co.uk Female 4.200.99.226 6709951086431189768 Philippines 52614.1
1454575979 17 Adam Woods awoodsg@mapy.cz Male 229.247.245.218 3580417672766100 Indonesia 12/8/1987 284906.49 Payment Adjustment Coordinator
1454555573 18 Theresa Ellis tellish@nydailynews.com Female 39.249.101.160 Belarus 6/18/1966 35216.95 Sales Representative
1454555343 19 Christopher Brooks cbrooksi@intel.com Male 252.52.58.13 China 119492.57
1454544139 20 Debra White dwhitej@umn.edu Female 142.140.184.111 Indonesia 47859.54
1454559526 21 Alice Ward awardk@cafepress.com Female 14.157.183.41 3554057857533990 Vietnam 5/7/1977 117790.3 Technical Writer
1454597106 22 Tina Wood twoodl@businesswire.com Female 201.242.103.145 3568980472135848 Sweden 3/28/1969 47283.17 Staff Scientist
1454591306 23 Carolyn Mendoza cmendozam@army.mil 214.205.231.22 Greece \N ␡
1454611603 24 Craig Ford cfordn@vistaprint.com Male 236.178.217.229 633110713949459104 Indonesia 12/22/1996 274187.59 Dental Hygienist
1454618551 25 Christine Morrison cmorrisono@ask.com Female 219.71.212.187 3538407669945679 Tanzania 3/12/1991 84756.66 Executive Secretary 社會科學院語學研究所
1454580024 26 Janice Dean jdeanp@statcounter.com Female 49.234.145.208 3537160378882698 Ukraine 8/21/1991 217443.08 Administrative Assistant III
1454558127 27 Joan Burton jburtonq@oaic.gov.au Female 221.227.41.244 201770241278691 China 4/6/1993 256763.22 Staff Accountant I \N
1454619460 28 Brandon Stone bstoner@discovery.com Male 1.106.6.30 30535344906416 Indonesia 7/13/1964 166396.41 Health Coach II
1454571966 29 Sarah Hall shalls@loc.gov Female 235.168.89.65 3528746985103311 Czech Republic 11/13/1959 123411.44 Assistant Manager
1454569447 30 Kelly Crawford kcrawfordt@typepad.com Female 152.220.24.54 3578225435679583 Poland 10/21/1970 115305.8 Chief Design Engineer
1454609438 31 Maria Banks mbanksu@google.co.uk Female 107.120.193.133 5602224764294077 Italy 10/29/1981 213273.21 Financial Analyst
1454546937 32 Roy Simmons rsimmonsv@telegraph.co.uk Male 21.20.158.183 5602244835346375 Mongolia 6/27/1994 13987.6 Senior Editor "<>?:""{}|_+"
1454611880 33 Judith Williamson jwilliamsonw@hubpages.com Female 128.75.193.80 3540423032294659 Indonesia 10/19/1975 35326.68 Senior Sales Associate
1454567714 34 Joe Arnold jarnoldx@soundcloud.com Male 170.118.207.254 4017955870878 Morocco 1/11/1991 261893.92 Mechanical Systems Engineer
1454605829 35 Richard Griffin rgriffiny@barnesandnoble.com Male 180.74.211.58 3539729371124817 Philippines 8/23/1964 43742.89 Nurse
1454607440 36 Billy Freeman bfreemanz@fda.gov Male 223.238.104.92 Sweden 5/19/1961 185185.85 Office Assistant I
1454601803 37 Shawn Welch swelch10@oaic.gov.au Male 239.144.169.67 Brazil 45785.65 test
1454626608 38 Kenneth Price kprice11@tamu.edu Male 121.107.99.253 372301962802254 China 3/1/1958 110448 Senior Sales Associate
1454612578 39 Patricia Lawson plawson12@dailymotion.com Female 181.201.209.42 6761282787969476 Czech Republic 4/6/1956 126454.68 Staff Accountant I
1454544201 40 Christine Alexander calexander13@aboutads.info Female 163.32.3.92 50183677518131890 China 1/14/1981 213713.99 Sales Associate
1454599667 41 Mark Wagner mwagner14@imageshack.us Male 78.141.201.64 5007660710388524 China 3/10/1987 207149.01 Staff Scientist
1454624139 42 Richard Armstrong rarmstrong15@baidu.com Male 229.173.184.111 3546008978147005 Indonesia 9/6/1961 52279.16 Software Engineer II
1454618327 43 Phillip Ellis pellis16@berkeley.edu Male 183.182.90.8 3561054399919267 Brazil 1/31/1994 59681.04 Analog Circuit Design manager \N
1454614376 44 Beverly Perry bperry17@nasa.gov Female 47.117.191.34 Vietnam 9/15/1983 41351.4 Database Administrator IV 1E+02
1454559810 45 Carolyn Parker cparker18@soup.io Female 124.227.162.209 3555739550936724 Belarus 1/29/1988 162142.52 Chemical Engineer
1454605899 46 Martin Knight mknight19@umn.edu Male 173.169.240.26 5387225346178705 China 9/4/1994 200217.98 Assistant Professor
1454580952 47 Michael Stephens mstephens1a@altervista.org Male 181.48.175.67 Honduras 9/10/1958 248987 Environmental Specialist
1454545483 48 Frances Willis fwillis1b@linkedin.com 102.186.57.75 4175001067968122 Philippines 8/3/1998 \N VP Marketing
1454618611 49 Gary Fox gfox1c@paginegialle.it Male 80.221.129.42 Belgium 261175.89
1454605416 50 Cynthia Bailey cbailey1d@microsoft.com Female 210.74.99.47 Indonesia 4/23/1989 38171.71 Sales Associate
1454547938 51 Terry Mitchell tmitchell1e@soundcloud.com Male 64.34.240.165 Peru 101626.65
1454607980 52 Edward Webb ewebb1f@123-reg.co.uk Male 208.114.99.74 6386981481832436 Jordan 235457.76
1454544152 53 Ralph Simmons rsimmons1g@google.cn Male 180.159.250.232 3554040768947822 Pakistan 111413.03
1454606074 54 Sara Kelly skelly1h@wix.com Female 97.243.219.196 3560161969850482 Portugal 12/11/1963 185788.86 Chief Design Engineer
1454577433 55 Donna Dean ddean1i@ftc.gov Female 91.232.196.181 Indonesia 285481.87
1454545198 56 Jane Murray jmurray1j@apache.org Female 174.82.82.71 5100149053428994 China 7/15/1973 57832.83 Software Consultant
1454582927 57 Walter Cook wcook1k@webnode.com Male 4.223.17.187 5048374925679138 China 7/19/1979 164010.7 Accounting Assistant IV
1454553504 58 Bonnie Hanson bhanson1l@squidoo.com Female 209.131.133.80 3546400025538536 China 8/6/1989 207065.08 Recruiter
1454583403 59 Patrick Kelly pkelly1m@usgs.gov Male 92.132.67.51 30129138653846 Poland 10/22/1984 281404.55 Librarian
1454551706 60 George Ross gross1n@sciencedaily.com Male 77.33.183.49 201938854334636 Portugal 2/17/1986 96243.17 Teacher
1454572199 61 Joan Harvey jharvey1o@biglobe.ne.jp Female 244.175.30.138 5479197462183554 Indonesia 12/30/1974 269498 Nurse Practicioner åß∂ƒ©˙∆˚¬…æ
1454555502 62 Louise Stone lstone1p@1und1.de Female 230.79.20.66 Indonesia 1/14/1980 44528.64 Senior Editor
1454597662 63 Lawrence Pierce lpierce1q@ihg.com Male 35.230.80.125 6763027632739915 Indonesia 7/22/1982 269467.08 Human Resources Assistant IV
1454577961 64 Dorothy Gray dgray1r@vimeo.com Female 206.99.76.117 3582462082297450 China 10/8/1975 58802.03 Staff Scientist -1.00
1454578138 65 Shawn Larson slarson1s@sohu.com Male 233.109.124.208 3557232712378033 Pakistan 6/11/1987 24566.92 Programmer I
1454620878 66 Ashley Carter acarter1t@weather.com Female 120.243.16.33 5641823823569006485 Philippines 2/4/1999 181594.54 Technical Writer
1454608592 67 Bruce Gonzalez bgonzalez1u@behance.net Male 213.165.12.93 5602219496203313 Sweden 6/27/1975 152915.03 Social Worker
1454570547 68 Gary Porter gporter1v@nhs.uk Male 113.26.17.148 3551504699131924 China 10/15/1988 239398.41 VP Sales åß∂ƒ©˙∆˚¬…æ
1454623375 69 Kimberly Bell kbell1w@techcrunch.com Female 232.188.203.114 06048433236353334 Tanzania 239482.42 "
1454580645 70 James Torres jtorres1x@rakuten.co.jp Male 42.70.136.181 Brazil 3/19/1968 66432.01 Information Systems Manager
1454565683 71 Cheryl Williams cwilliams1y@clickbank.net 24.11.168.130 Latvia 9/28/1958 \N Quality Control Specialist
1454572298 72 Diane Hicks dhicks1z@noaa.gov Female 220.185.241.90 36196827669213 Honduras 11/20/1977 104365.11 Systems Administrator I
1454630150 73 Judith Brown jbrown20@acquirethisname.com Female 173.62.110.176 Czech Republic 12/26/1994 218616.17 Safety Technician IV
1454550898 74 Jesse Dixon jdixon21@bloglines.com Male 156.125.120.208 Syria 277530.58 (╯°□°)╯︵ ┻━┻)
1454560223 75 Timothy Garza tgarza22@tmall.com Male 56.172.71.231 Poland 4/1/1978 21103.66 Desktop Support Technician ␡
1454549446 76 Gloria Washington gwashington23@hud.gov Female 249.63.88.116 3528613230855766 Portugal 10/17/1960 175586.21 Information Systems Manager
1454555260 77 Patricia Bell pbell24@youtu.be Female 20.46.164.228 3528267541114924 Honduras 1/31/1999 47750.6 Payment Adjustment Coordinator
1454579807 78 Theresa Clark tclark25@wp.com Female 178.250.150.112 6396247540156151 Indonesia 10/10/1989 78319.93 Executive Secretary
1454629649 79 Matthew Matthews mmatthews26@typepad.com Male 33.186.230.54 5213341713953768 Azerbaijan 10/4/1990 12883.34 Help Desk Technician
1454568333 80 Betty White bwhite27@github.com Female 128.110.102.181 3572999005932624 Morocco 12/6/1980 30998.69 Operator
1454559489 81 Christina Nguyen cnguyen28@washingtonpost.com Female 63.57.110.32 36954036240279 Philippines 7/23/1984 259707.25 Project Manager
1454575575 82 Norma Stevens nstevens29@newyorker.com Female 148.35.34.31 Brazil 7/24/1984 233848.07 Professor
1454547659 83 Tammy Walker twalker2a@craigslist.org Female 115.94.89.2 4508955158259501 China 1/1/1972 241046.96 Community Outreach Specialist
1454559813 84 Mark Jackson mjackson2b@utexas.edu Male 136.242.153.66 36666130651082 Philippines 12/9/1957 245352.11 Account Executive 部落格
1454547442 85 Scott Washington swashington2c@bloomberg.com Male 79.185.72.100 6395647151650882 Brazil 2/17/1957 240505.52 Professor
1454577775 86 Margaret Franklin mfranklin2d@mapy.cz Female 139.209.240.12 501835281527257384 Brazil 72758.49
1454582451 87 Carolyn Wilson cwilson2e@hp.com Female 5.172.62.195 3581164938009805 France 1/19/1997 162909.64 Librarian
1454608782 88 Emily Cole ecole2f@epa.gov 97.83.153.33 Burkina Faso 5/3/1996 \N Accounting Assistant IV 1.00
1454544809 89 Carolyn Gutierrez cgutierrez2g@smh.com.au Female 109.77.234.103 Madagascar 2/13/1999 139612.73 Nurse
1454591667 90 Jose Wallace jwallace2h@about.com Male 250.231.81.57 Philippines 12/17/1983 213500.16 Design Engineer
1454561119 91 Charles Reed creed2i@independent.co.uk Male 28.212.235.149 4017954848825528 China 88039.86
1454615732 92 Brian Parker bparker2j@hugedomains.com Male 143.67.111.179 Portugal 1/18/1996 202446.54 Executive Secretary
1454613613 93 Donald Fox dfox2k@webs.com Male 251.61.52.170 3553498748210516 Indonesia 12/19/1975 134745.75 Human Resources Manager
1454603200 94 Jack West jwest2l@biblegateway.com Male 115.144.142.60 Poland 10/30/1956 245162.49 Office Assistant I 1.00
1454574412 95 Doris Gomez dgomez2m@tinypic.com Female 156.173.76.213 4041593860679 Colombia 8/28/1977 164689.56 Speech Pathologist
1454544624 96 Brandon Owens bowens2n@si.edu Male 5.39.151.46 4591258400528650 France 3/13/1998 74028.68 Software Engineer III
1454596449 97 Evelyn Wagner ewagner2o@sbwire.com Female 84.231.120.250 3571837377153521 China 1/5/1965 78692.34 Operator
1454545547 98 Timothy Boyd tboyd2p@imdb.com Male 211.20.45.168 5602253132446507 Peru 7/8/1976 127883.56 Data Coordiator
1454549050 99 Edward Gilbert egilbert2q@ocn.ne.jp Male 237.183.200.242 3586807595028188 Bangladesh 8/30/1956 214872.75 Senior Financial Analyst
1454583513 100 Howard Patterson hpatterson2r@toplist.cz Male 200.77.150.4 3558592437934298 China 7/9/1991 23607 Administrative Assistant IV
=== Try load data from v0.7.1.all-named-index.parquet
0.22 65.1 61 337 3.87 3.78 2.49 Fair E VS2
0.23 56.9 65 327 4.05 4.07 2.31 Good E VS1
0.31 63.3 58 335 4.34 4.35 2.75 Good J SI2
0.23 61.5 55 326 3.95 3.98 2.43 Ideal E SI2
0.21 59.8 61 326 3.89 3.84 2.31 Premium E SI1
0.29 62.4 58 334 4.2 4.23 2.63 Premium I VS2
0.26 61.9 55 337 4.07 4.11 2.53 Very Good H SI1
0.23 59.4 61 338 4 4.05 2.39 Very Good H VS1
0.24 62.3 57 336 3.95 3.98 2.47 Very Good I VVS1
0.24 62.8 57 336 3.94 3.96 2.48 Very Good J VVS2
=== Try load data from v0.7.1.column-metadata-handling.parquet
1 0.1 2017-01-01 02:00:00 a 2017-01-01 02:00:00
2 0.2 2017-01-02 02:00:00 b 2017-01-02 02:00:00
3 0.3 2017-01-03 02:00:00 c 2017-01-03 02:00:00
=== Try load data from v0.7.1.parquet
0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 0
0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 1
0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 2
0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 3
0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 4
0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 5
0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 6
0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 7
0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 8
0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 9
=== Try load data from v0.7.1.some-named-index.parquet
0.22 65.1 61 337 3.87 3.78 2.49 Fair E VS2
0.23 56.9 65 327 4.05 4.07 2.31 Good E VS1
0.31 63.3 58 335 4.34 4.35 2.75 Good J SI2
0.23 61.5 55 326 3.95 3.98 2.43 Ideal E SI2
0.21 59.8 61 326 3.89 3.84 2.31 Premium E SI1
0.29 62.4 58 334 4.2 4.23 2.63 Premium I VS2
0.26 61.9 55 337 4.07 4.11 2.53 Very Good H SI1
0.23 59.4 61 338 4 4.05 2.39 Very Good H VS1
0.24 62.3 57 336 3.95 3.98 2.47 Very Good I VVS1
0.24 62.8 57 336 3.94 3.96 2.48 Very Good J VVS2

View File

@ -0,0 +1,58 @@
#!/usr/bin/env bash
#
# Load all possible .parquet files found in submodules.
# For every data_parquet/*.parquet file: create test.parquet_load from the
# matching .columns definition, insert the file using the Parquet input format,
# print up to 100 rows and drop the table again.
# TODO: Add more files.
#
# To regenerate data install perl JSON::XS module: sudo apt install libjson-xs-perl
# Also 5 sample files from
#   wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata1.parquet
#   ...
#   wget https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata5.parquet

# Uncomment to trace every command while debugging:
# set -x

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CUR_DIR"/../shell_config.sh

# dirname yields "." when CLICKHOUSE_CLIENT_BINARY is a bare command name. Only
# when the binary is referenced through a path is BUILD_DIR derived from it;
# BUILD_DIR in turn enables regeneration of the .json/.columns files below.
CB_DIR=$(dirname "$CLICKHOUSE_CLIENT_BINARY")
if [ "$CB_DIR" == "." ]; then
    ROOT_DIR=$CUR_DIR/../../../..
else
    BUILD_DIR=$CB_DIR/../..
fi
# Keep a ROOT_DIR that is already set; otherwise derive it from the binary location.
[ -z "$ROOT_DIR" ] && ROOT_DIR=$CB_DIR/../../..

DATA_DIR=$CUR_DIR/data_parquet

# To update:
# cp $ROOT_DIR/contrib/arrow/cpp/submodules/parquet-testing/data/*.parquet $ROOT_DIR/contrib/arrow/python/pyarrow/tests/data/parquet/*.parquet $CUR_DIR/data_parquet/

# BUG! nulls.snappy.parquet - parquet-reader shows wrong structure. Actual structure is {"type":"struct","fields":[{"name":"b_struct","type":{"type":"struct","fields":[{"name":"b_c_int","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}
# why? repeated_no_annotation.parquet

# Iterate with a glob instead of parsing `ls` output: the glob expands in sorted
# order and stays correct when the checkout path contains whitespace.
for PARQUET_FILE in "$DATA_DIR"/*.parquet; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$PARQUET_FILE" ] || continue
    NAME=$(basename "$PARQUET_FILE")

    echo "=== Try load data from $NAME"

    JSON=$DATA_DIR/$NAME.json
    COLUMNS_FILE=$DATA_DIR/$NAME.columns

    # If you want to change or add a .parquet file: rm data_parquet/*.json data_parquet/*.columns
    # Missing (or empty) metadata is regenerated only when a build directory is known.
    if [ -n "$BUILD_DIR" ] && [ ! -s "$COLUMNS_FILE" ]; then
        [ -s "$JSON" ] || "$BUILD_DIR"/contrib/arrow-cmake/parquet-reader --json "$PARQUET_FILE" > "$JSON"
        "$CUR_DIR"/00900_parquet_create_table_columns.pl "$JSON" > "$COLUMNS_FILE"
    fi

    # Debug only:
    # [ -n "$BUILD_DIR" ] && "$BUILD_DIR"/contrib/arrow-cmake/parquet-reader "$PARQUET_FILE" > "$DATA_DIR/$NAME.dump"

    # Not named COLUMNS: bash manages that variable itself (terminal width) and
    # may overwrite it after an external command when checkwinsize is enabled.
    TABLE_COLUMNS=$(cat "$COLUMNS_FILE") || continue

    # CLICKHOUSE_CLIENT is intentionally left unquoted so that a value holding
    # the command plus its arguments is split into separate words.
    ${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.parquet_load"
    ${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.parquet_load ($TABLE_COLUMNS) ENGINE = Memory"

    # Some files are broken, so an exception here is OK. The sed rewrites the
    # word "Exception" so that it does not appear verbatim in the test output.
    ${CLICKHOUSE_CLIENT} --query="INSERT INTO test.parquet_load FORMAT Parquet" < "$PARQUET_FILE" 2>&1 | sed 's/Exception/Ex---tion/'

    ${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.parquet_load LIMIT 100"
    ${CLICKHOUSE_CLIENT} --query="DROP TABLE test.parquet_load"
done

View File

@ -0,0 +1 @@
id Nullable(Int32), bool_col Nullable(UInt8), tinyint_col Nullable(Int32), smallint_col Nullable(Int32), int_col Nullable(Int32), bigint_col Nullable(Int64), float_col Nullable(Float32), double_col Nullable(Float64), date_string_col Nullable(String), string_col Nullable(String), timestamp_col Nullable(Int64)

View File

@ -0,0 +1 @@
id Nullable(Int32), bool_col Nullable(UInt8), tinyint_col Nullable(Int32), smallint_col Nullable(Int32), int_col Nullable(Int32), bigint_col Nullable(Int64), float_col Nullable(Float32), double_col Nullable(Float64), date_string_col Nullable(String), string_col Nullable(String), timestamp_col Nullable(Int64)

View File

@ -0,0 +1 @@
id Nullable(Int32), bool_col Nullable(UInt8), tinyint_col Nullable(Int32), smallint_col Nullable(Int32), int_col Nullable(Int32), bigint_col Nullable(Int64), float_col Nullable(Float32), double_col Nullable(Float64), date_string_col Nullable(String), string_col Nullable(String), timestamp_col Nullable(Int64)

View File

@ -0,0 +1 @@
value Nullable(Decimal128(1))

View File

@ -0,0 +1 @@
a Nullable(String), b Nullable(Int32), c Nullable(Float64), d Nullable(UInt8), element Nullable(Int32)

View File

@ -0,0 +1 @@
value Nullable(Decimal128(1))

View File

@ -0,0 +1 @@
value Nullable(Decimal128(1))

View File

@ -0,0 +1 @@
value Nullable(Decimal128(1))

View File

@ -0,0 +1 @@
value Nullable(Decimal128(1))

View File

@ -0,0 +1 @@
nation_key Nullable(Int32), name Nullable(String), region_key Nullable(Int32), comment_col Nullable(String)

View File

@ -0,0 +1 @@
element Nullable(String), b Nullable(Int32)

View File

@ -0,0 +1 @@
key Nullable(String), key1 Nullable(Int32), value Nullable(UInt8), b Nullable(Int32), c Nullable(Float64)

View File

@ -0,0 +1 @@
ID Nullable(Int64), element Nullable(Int32), element2 Nullable(Int32), key Nullable(String), value Nullable(Int32), key5 Nullable(String), value6 Nullable(Int32), a Nullable(Int32), element8 Nullable(Int32), e Nullable(Int32), f Nullable(String), key11 Nullable(String), element12 Nullable(Float64)

View File

@ -0,0 +1 @@
id Nullable(Int64), element Nullable(Int32), element2 Nullable(Int32), key Nullable(String), value Nullable(Int32), key5 Nullable(String), value6 Nullable(Int32), A Nullable(Int32), element8 Nullable(Int32), E Nullable(Int32), F Nullable(String), key11 Nullable(String), element12 Nullable(Float64)

View File

@ -0,0 +1 @@
b_c_int Nullable(Int32)

View File

@ -0,0 +1 @@
id Nullable(Int32), number Nullable(Int64), kind Nullable(String)

View File

@ -0,0 +1 @@
registration_dttm Nullable(Int64), id Nullable(Int32), first_name Nullable(String), last_name Nullable(String), email Nullable(String), gender Nullable(String), ip_address Nullable(String), cc Nullable(String), country Nullable(String), birthdate Nullable(String), salary Nullable(Float64), title Nullable(String), comments Nullable(String)

View File

@ -0,0 +1 @@
registration_dttm Nullable(Int64), id Nullable(Int32), first_name Nullable(String), last_name Nullable(String), email Nullable(String), gender Nullable(String), ip_address Nullable(String), cc Nullable(String), country Nullable(String), birthdate Nullable(String), salary Nullable(Float64), title Nullable(String), comments Nullable(String)

View File

@ -0,0 +1 @@
registration_dttm Nullable(Int64), id Nullable(Int32), first_name Nullable(String), last_name Nullable(String), email Nullable(String), gender Nullable(String), ip_address Nullable(String), cc Nullable(String), country Nullable(String), birthdate Nullable(String), salary Nullable(Float64), title Nullable(String), comments Nullable(String)

View File

@ -0,0 +1 @@
registration_dttm Nullable(Int64), id Nullable(Int32), first_name Nullable(String), last_name Nullable(String), email Nullable(String), gender Nullable(String), ip_address Nullable(String), cc Nullable(String), country Nullable(String), birthdate Nullable(String), salary Nullable(Float64), title Nullable(String), comments Nullable(String)

View File

@ -0,0 +1 @@
registration_dttm Nullable(Int64), id Nullable(Int32), first_name Nullable(String), last_name Nullable(String), email Nullable(String), gender Nullable(String), ip_address Nullable(String), cc Nullable(String), country Nullable(String), birthdate Nullable(String), salary Nullable(Float64), title Nullable(String), comments Nullable(String)

View File

@ -0,0 +1 @@
carat Nullable(Float64), depth Nullable(Float64), table Nullable(Float64), price Nullable(Int64), x Nullable(Float64), y Nullable(Float64), z Nullable(Float64), cut Nullable(String), color Nullable(String), clarity Nullable(String)

View File

@ -0,0 +1 @@
a Nullable(Int64), b Nullable(Float64), c Nullable(DateTime), index Nullable(String), __index_level_1__ Nullable(DateTime)

View File

@ -0,0 +1 @@
carat Nullable(Float64), cut Nullable(String), color Nullable(String), clarity Nullable(String), depth Nullable(Float64), table Nullable(Float64), price Nullable(Int64), x Nullable(Float64), y Nullable(Float64), z Nullable(Float64), __index_level_0__ Nullable(Int64)

View File

@ -0,0 +1 @@
carat Nullable(Float64), depth Nullable(Float64), table Nullable(Float64), price Nullable(Int64), x Nullable(Float64), y Nullable(Float64), z Nullable(Float64), cut Nullable(String), __index_level_1__ Nullable(String), clarity Nullable(String)

Some files were not shown because too many files have changed in this diff Show More