From 7320447f9285e64aa694a13253139912a78a0dc2 Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Wed, 8 Jan 2020 03:13:12 -0600 Subject: [PATCH 01/89] Add Avro formats Add Avro file input/output formats Add AvroConfluent input format (for Kafka) --- .gitmodules | 6 +- CMakeLists.txt | 1 + cmake/find/avro.cmake | 43 ++ cmake/find/boost.cmake | 3 +- cmake/sanitize.cmake | 2 + contrib/CMakeLists.txt | 16 + contrib/avro | 1 + contrib/avro-cmake/CMakeLists.txt | 76 +++ contrib/boost | 2 +- contrib/boost-cmake/CMakeLists.txt | 5 + dbms/CMakeLists.txt | 5 + dbms/src/Core/Settings.h | 1 + dbms/src/Formats/FormatFactory.cpp | 3 + dbms/src/Formats/FormatFactory.h | 2 + dbms/src/Formats/FormatSettings.h | 6 + dbms/src/Formats/config_formats.h.in | 1 + .../Formats/Impl/AvroRowInputFormat.cpp | 620 ++++++++++++++++++ .../Formats/Impl/AvroRowInputFormat.h | 70 ++ .../Formats/Impl/AvroRowOutputFormat.cpp | 326 +++++++++ .../Formats/Impl/AvroRowOutputFormat.h | 60 ++ 20 files changed, 1246 insertions(+), 3 deletions(-) create mode 100644 cmake/find/avro.cmake create mode 160000 contrib/avro create mode 100644 contrib/avro-cmake/CMakeLists.txt create mode 100644 dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp create mode 100644 dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h create mode 100644 dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp create mode 100644 dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.h diff --git a/.gitmodules b/.gitmodules index a35e4ba36e0..8147eb31799 100644 --- a/.gitmodules +++ b/.gitmodules @@ -46,7 +46,7 @@ url = https://github.com/ClickHouse-Extras/protobuf.git [submodule "contrib/boost"] path = contrib/boost - url = https://github.com/ClickHouse-Extras/boost.git + url = https://github.com/oandrew/clickhouse-boost [submodule "contrib/base64"] path = contrib/base64 url = https://github.com/aklomp/base64.git @@ -137,3 +137,7 @@ [submodule "contrib/ryu"] path = contrib/ryu url = https://github.com/ClickHouse-Extras/ryu.git 
+[submodule "contrib/avro"] + path = contrib/avro + url = https://github.com/apache/avro.git + ignore = untracked diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c32baa569d..949879cd29a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -351,6 +351,7 @@ include (cmake/find/simdjson.cmake) include (cmake/find/rapidjson.cmake) include (cmake/find/fastops.cmake) include (cmake/find/orc.cmake) +include (cmake/find/avro.cmake) find_contrib_lib(cityhash) find_contrib_lib(farmhash) diff --git a/cmake/find/avro.cmake b/cmake/find/avro.cmake new file mode 100644 index 00000000000..7eb5c187cf8 --- /dev/null +++ b/cmake/find/avro.cmake @@ -0,0 +1,43 @@ +option (ENABLE_AVRO "Enable Avro" ${ENABLE_LIBRARIES}) + +if (ENABLE_AVRO) + +option (USE_INTERNAL_AVRO_LIBRARY "Set to FALSE to use system avro library instead of bundled" ${NOT_UNBUNDLED}) + +if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/avro/lang/c++/CMakeLists.txt") + if(USE_INTERNAL_AVRO_LIBRARY) + message(WARNING "submodule contrib/avro is missing. 
to fix try run: \n git submodule update --init --recursive") + endif() + set(MISSING_INTERNAL_AVRO_LIBRARY 1) + set(USE_INTERNAL_AVRO_LIBRARY 0) +endif() + +if (NOT USE_INTERNAL_AVRO_LIBRARY) + find_package(Snappy REQUIRED) + find_library(AVROCPP avrocpp) +elseif(NOT MISSING_INTERNAL_AVRO_LIBRARY) + include(cmake/find/snappy.cmake) + add_subdirectory(contrib/avro-cmake) + set(AVROCPP_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/avro/lang/c++/include") + set(AVROCPP_LIBRARY avrocpp_s) +endif () + +if (AVROCPP_LIBRARY AND AVROCPP_INCLUDE_DIR) + set(USE_AVRO 1) +endif() + + +# if (AVROCPP_LIBRARY AND AVROCPP_INCLUDE_DIR) +# set(USE_AVROCPP 1) +# elseif (Boost_INCLUDE_DIRS AND SNAPPY_LIBRARY) +# set(AVROCPP_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/avro/lang/c++/include") +# set(AVROCPP_LIBRARY avrocpp_s) +# set(USE_AVROCPP 1) +# else() +# set(USE_INTERNAL_AVROCPP_LIBRARY 0) +# message(STATUS "avro deps: ${Boost_INCLUDE_DIRS}; ${SNAPPY_LIBRARY}; ${ZLIB_LIBRARY}") +# endif() + +endif() + +message (STATUS "Using avro=${USE_AVRO}: ${AVROCPP_LIBRARY} ${AVROCPP_INCLUDE_DIR}") diff --git a/cmake/find/boost.cmake b/cmake/find/boost.cmake index 6776d0cea06..ec10a34d839 100644 --- a/cmake/find/boost.cmake +++ b/cmake/find/boost.cmake @@ -31,6 +31,7 @@ if (NOT Boost_SYSTEM_LIBRARY AND NOT MISSING_INTERNAL_BOOST_LIBRARY) set (Boost_SYSTEM_LIBRARY boost_system_internal) set (Boost_PROGRAM_OPTIONS_LIBRARY boost_program_options_internal) set (Boost_FILESYSTEM_LIBRARY boost_filesystem_internal ${Boost_SYSTEM_LIBRARY}) + set (Boost_IOSTREAMS_LIBRARY boost_iostreams_internal) set (Boost_REGEX_LIBRARY boost_regex_internal) set (Boost_INCLUDE_DIRS) @@ -48,4 +49,4 @@ if (NOT Boost_SYSTEM_LIBRARY AND NOT MISSING_INTERNAL_BOOST_LIBRARY) list (APPEND Boost_INCLUDE_DIRS "${ClickHouse_SOURCE_DIR}/contrib/boost") endif () -message (STATUS "Using Boost: ${Boost_INCLUDE_DIRS} : ${Boost_PROGRAM_OPTIONS_LIBRARY},${Boost_SYSTEM_LIBRARY},${Boost_FILESYSTEM_LIBRARY},${Boost_REGEX_LIBRARY}") 
+message (STATUS "Using Boost: ${Boost_INCLUDE_DIRS} : ${Boost_PROGRAM_OPTIONS_LIBRARY},${Boost_SYSTEM_LIBRARY},${Boost_FILESYSTEM_LIBRARY},${Boost_IOSTREAMS_LIBRARY},${Boost_REGEX_LIBRARY}") diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index cb099ade7f5..089ec81691a 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -51,9 +51,11 @@ if (SANITIZE) set (ENABLE_READLINE 0 CACHE BOOL "") set (ENABLE_ORC 0 CACHE BOOL "") set (ENABLE_PARQUET 0 CACHE BOOL "") + set (ENABLE_AVRO 0 CACHE BOOL "") set (USE_CAPNP 0 CACHE BOOL "") set (USE_INTERNAL_ORC_LIBRARY 0 CACHE BOOL "") set (USE_ORC 0 CACHE BOOL "") + set (USE_AVRO 0 CACHE BOOL "") set (ENABLE_SSL 0 CACHE BOOL "") elseif (SANITIZE STREQUAL "thread") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 53ad9a0c138..f155940c32a 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -212,6 +212,22 @@ else() endif() endif() +if (USE_INTERNAL_AVRO_LIBRARY) + if(USE_INTERNAL_SNAPPY_LIBRARY) + set(SNAPPY_BUILD_TESTS 0 CACHE INTERNAL "") + if (NOT MAKE_STATIC_LIBRARIES) + set(BUILD_SHARED_LIBS 1) # TODO: set at root dir + endif() + + add_subdirectory(snappy) + + set (SNAPPY_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/snappy") + if(SANITIZE STREQUAL "undefined") + target_compile_options(${SNAPPY_LIBRARY} PRIVATE -fno-sanitize=undefined) + endif() + endif() +endif() + if (USE_INTERNAL_POCO_LIBRARY) set (POCO_VERBOSE_MESSAGES 0 CACHE INTERNAL "") set (save_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) diff --git a/contrib/avro b/contrib/avro new file mode 160000 index 00000000000..89218262cde --- /dev/null +++ b/contrib/avro @@ -0,0 +1 @@ +Subproject commit 89218262cde62e98fcb3778b86cd3f03056c54f3 diff --git a/contrib/avro-cmake/CMakeLists.txt b/contrib/avro-cmake/CMakeLists.txt new file mode 100644 index 00000000000..643c68c54c5 --- /dev/null +++ b/contrib/avro-cmake/CMakeLists.txt @@ -0,0 +1,76 @@ +# project and source dir +set(AVROCPP_ROOT_DIR 
${CMAKE_SOURCE_DIR}/contrib/avro/lang/c++) +set(AVROCPP_INCLUDE_DIR ${AVROCPP_ROOT_DIR}/api) +set(AVROCPP_SOURCE_DIR ${AVROCPP_ROOT_DIR}/impl) + +#set(AVROCPP_COMMON_DIR ${HDFS3_SOURCE_DIR}/common) + +if (EXISTS ${AVROCPP_ROOT_DIR}/../../share/VERSION.txt) + file(READ "${AVROCPP_ROOT_DIR}/../../share/VERSION.txt" + AVRO_VERSION) +endif() + +string(REPLACE "\n" "" AVRO_VERSION ${AVRO_VERSION}) +set (AVRO_VERSION_MAJOR ${AVRO_VERSION}) +set (AVRO_VERSION_MINOR "0") + +set (AVROCPP_SOURCE_FILES + ${AVROCPP_SOURCE_DIR}/Compiler.cc + ${AVROCPP_SOURCE_DIR}/Node.cc + ${AVROCPP_SOURCE_DIR}/LogicalType.cc + ${AVROCPP_SOURCE_DIR}/NodeImpl.cc + ${AVROCPP_SOURCE_DIR}/ResolverSchema.cc + ${AVROCPP_SOURCE_DIR}/Schema.cc + ${AVROCPP_SOURCE_DIR}/Types.cc + ${AVROCPP_SOURCE_DIR}/ValidSchema.cc + ${AVROCPP_SOURCE_DIR}/Zigzag.cc + ${AVROCPP_SOURCE_DIR}/BinaryEncoder.cc + ${AVROCPP_SOURCE_DIR}/BinaryDecoder.cc + ${AVROCPP_SOURCE_DIR}/Stream.cc + ${AVROCPP_SOURCE_DIR}/FileStream.cc + ${AVROCPP_SOURCE_DIR}/Generic.cc + ${AVROCPP_SOURCE_DIR}/GenericDatum.cc + ${AVROCPP_SOURCE_DIR}/DataFile.cc + ${AVROCPP_SOURCE_DIR}/parsing/Symbol.cc + ${AVROCPP_SOURCE_DIR}/parsing/ValidatingCodec.cc + ${AVROCPP_SOURCE_DIR}/parsing/JsonCodec.cc + ${AVROCPP_SOURCE_DIR}/parsing/ResolvingDecoder.cc + ${AVROCPP_SOURCE_DIR}/json/JsonIO.cc + ${AVROCPP_SOURCE_DIR}/json/JsonDom.cc + ${AVROCPP_SOURCE_DIR}/Resolver.cc + ${AVROCPP_SOURCE_DIR}/Validator.cc + ) + + +add_definitions(-std=c++17 -fPIC) + +add_library (avrocpp SHARED ${AVROCPP_SOURCE_FILES}) + +set_property (TARGET avrocpp + APPEND PROPERTY COMPILE_DEFINITIONS AVRO_DYN_LINK) + +add_library (avrocpp_s STATIC ${AVROCPP_SOURCE_FILES}) + +set_property (TARGET avrocpp avrocpp_s + APPEND PROPERTY COMPILE_DEFINITIONS AVRO_SOURCE) + +set_target_properties (avrocpp PROPERTIES + VERSION ${AVRO_VERSION_MAJOR}.${AVRO_VERSION_MINOR}) + +set_target_properties (avrocpp_s PROPERTIES + VERSION ${AVRO_VERSION_MAJOR}.${AVRO_VERSION_MINOR}) + +target_link_libraries (avrocpp 
${Boost_IOSTREAMS_LIBRARY} ${SNAPPY_LIBRARY}) +target_link_libraries (avrocpp_s ${Boost_IOSTREAMS_LIBRARY} ${SNAPPY_LIBRARY}) + +target_compile_definitions (avrocpp PUBLIC SNAPPY_CODEC_AVAILABLE) +target_compile_definitions (avrocpp_s PUBLIC SNAPPY_CODEC_AVAILABLE) + +include_directories(${AVROCPP_INCLUDE_DIR}) +include_directories(${Boost_INCLUDE_DIRS}) +include_directories(${SNAPPY_INCLUDE_DIR}) + +ADD_CUSTOM_TARGET(symlink_headers ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${AVROCPP_ROOT_DIR}/include + COMMAND ${CMAKE_COMMAND} -E create_symlink ${AVROCPP_ROOT_DIR}/api ${AVROCPP_ROOT_DIR}/include/avro +) \ No newline at end of file diff --git a/contrib/boost b/contrib/boost index 830e51edb59..a2cfeb63eaf 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 830e51edb59c4f37a8638138581e1e56c29ac44f +Subproject commit a2cfeb63eaf3b32cf233105b1a40f4a5f26b8495 diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index d9a8a70ef17..54dcd750320 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -37,3 +37,8 @@ target_link_libraries(boost_filesystem_internal PRIVATE boost_system_internal) if (USE_INTERNAL_PARQUET_LIBRARY) add_boost_lib(regex) endif() + +if (USE_INTERNAL_AVRO_LIBRARY) + add_boost_lib(iostreams) + target_link_libraries(boost_iostreams_internal PUBLIC ${ZLIB_LIBRARIES}) +endif() diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index e0c8b7da37a..4f9ca404cdf 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -483,6 +483,11 @@ if (USE_PARQUET) endif () endif () +if (USE_AVRO) + dbms_target_link_libraries(PRIVATE ${AVROCPP_LIBRARY}) + dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${AVROCPP_INCLUDE_DIR}) +endif () + if (OPENSSL_CRYPTO_LIBRARY) dbms_target_link_libraries (PRIVATE ${OPENSSL_CRYPTO_LIBRARY}) target_link_libraries (clickhouse_common_io PRIVATE ${OPENSSL_CRYPTO_LIBRARY}) diff --git a/dbms/src/Core/Settings.h 
b/dbms/src/Core/Settings.h index 724b31ca642..26684153832 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -186,6 +186,7 @@ struct Settings : public SettingsCollection M(SettingBool, input_format_values_interpret_expressions, true, "For Values format: if field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \ M(SettingBool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \ M(SettingBool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \ + M(SettingString, input_format_avro_schema_registry_url, "", "For AvroConfluent format: Confluent Schema Registry URL.", 0) \ \ M(SettingBool, output_format_json_quote_64bit_integers, true, "Controls quoting of 64-bit integers in JSON output format.", 0) \ \ diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index 240e591123f..ade91c5a391 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -68,6 +68,7 @@ static FormatSettings getInputFormatSetting(const Settings & settings, const Con format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter; format_settings.custom.row_after_delimiter = settings.format_custom_row_after_delimiter; format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter; + format_settings.avro.schema_registry_url = settings.input_format_avro_schema_registry_url; return format_settings; } @@ -325,6 +326,8 @@ FormatFactory::FormatFactory() registerInputFormatProcessorORC(*this); registerInputFormatProcessorParquet(*this); 
registerOutputFormatProcessorParquet(*this); + registerInputFormatProcessorAvro(*this); + registerOutputFormatProcessorAvro(*this); registerInputFormatProcessorTemplate(*this); registerOutputFormatProcessorTemplate(*this); diff --git a/dbms/src/Formats/FormatFactory.h b/dbms/src/Formats/FormatFactory.h index cbf64afeaec..345ceaee690 100644 --- a/dbms/src/Formats/FormatFactory.h +++ b/dbms/src/Formats/FormatFactory.h @@ -166,6 +166,8 @@ void registerInputFormatProcessorORC(FormatFactory & factory); void registerOutputFormatProcessorParquet(FormatFactory & factory); void registerInputFormatProcessorProtobuf(FormatFactory & factory); void registerOutputFormatProcessorProtobuf(FormatFactory & factory); +void registerInputFormatProcessorAvro(FormatFactory & factory); +void registerOutputFormatProcessorAvro(FormatFactory & factory); void registerInputFormatProcessorTemplate(FormatFactory & factory); void registerOutputFormatProcessorTemplate(FormatFactory &factory); diff --git a/dbms/src/Formats/FormatSettings.h b/dbms/src/Formats/FormatSettings.h index 6219edf6e6d..6ca54c12265 100644 --- a/dbms/src/Formats/FormatSettings.h +++ b/dbms/src/Formats/FormatSettings.h @@ -110,6 +110,12 @@ struct FormatSettings }; Custom custom; + + struct Avro + { + String schema_registry_url; + } avro; + }; } diff --git a/dbms/src/Formats/config_formats.h.in b/dbms/src/Formats/config_formats.h.in index 1ddd0e18aa9..308ded92b5d 100644 --- a/dbms/src/Formats/config_formats.h.in +++ b/dbms/src/Formats/config_formats.h.in @@ -2,6 +2,7 @@ // .h autogenerated by cmake! 
+#cmakedefine01 USE_AVRO #cmakedefine01 USE_CAPNP #cmakedefine01 USE_SNAPPY #cmakedefine01 USE_PARQUET diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp new file mode 100644 index 00000000000..eca22670a87 --- /dev/null +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -0,0 +1,620 @@ +#include "AvroRowInputFormat.h" +#if USE_AVRO + +#include + +#include +#include +#include +#include + + +#include + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_TYPE_OF_FIELD; + extern const int BAD_ARGUMENTS; + extern const int THERE_IS_NO_COLUMN; + extern const int LOGICAL_ERROR; + extern const int INCORRECT_DATA; + extern const int CANNOT_READ_ALL_DATA; + extern const int ILLEGAL_COLUMN; + extern const int TYPE_MISMATCH; +} + +class InputStreamReadBufferAdapter : public avro::InputStream +{ +public: + InputStreamReadBufferAdapter(ReadBuffer & in_) : in(in_) {} + + bool next(const uint8_t ** data, size_t * len) + { + if (in.eof()) + { + *len = 0; + return false; + } + + *data = (const uint8_t *)in.position(); + *len = in.available(); + + in.position() += in.available(); + return true; + } + + void backup(size_t len) { in.position() -= len; } + + void skip(size_t len) { in.tryIgnore(len); } + + size_t byteCount() const { return in.count(); } + +private: + ReadBuffer & in; +}; + +static void deserializeNoop(IColumn &, avro::Decoder &) +{ +} + +AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::NodePtr root_node, 
DataTypePtr target_type) +{ + auto logical_type = root_node->logicalType().type(); + WhichDataType target(target_type); + switch (root_node->type()) + { + case avro::AVRO_STRING: + if (target.isString()) + { + return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable { + decoder.decodeString(tmp); + column.insertData(tmp.c_str(), tmp.length()); + }; + } + case avro::AVRO_BYTES: + if (target.isString()) + { + return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable { + decoder.decodeString(tmp); + column.insertData(tmp.c_str(), tmp.length()); + }; + } + break; + case avro::AVRO_INT: + if (target.isInt32()) + { + return + [](IColumn & column, avro::Decoder & decoder) { assert_cast(column).insertValue(decoder.decodeInt()); }; + } + if (target.isDate() && logical_type == avro::LogicalType::DATE) + { + return [](IColumn & column, avro::Decoder & decoder) { + assert_cast(column).insertValue(decoder.decodeInt()); + }; + } + break; + case avro::AVRO_LONG: + if (target.isInt64()) + { + return + [](IColumn & column, avro::Decoder & decoder) { assert_cast(column).insertValue(decoder.decodeLong()); }; + } + if (target.isDateTime64()) + { + auto date_time_scale = assert_cast(*target_type).getScale(); + if ((logical_type == avro::LogicalType::TIMESTAMP_MILLIS && date_time_scale == 3) + || (logical_type == avro::LogicalType::TIMESTAMP_MICROS && date_time_scale == 6)) + { + return [](IColumn & column, avro::Decoder & decoder) { + assert_cast(column).insertValue(decoder.decodeLong()); + }; + } + } + break; + case avro::AVRO_FLOAT: + if (target.isFloat32()) + { + return [](IColumn & column, avro::Decoder & decoder) { + assert_cast(column).insertValue(decoder.decodeFloat()); + }; + } + break; + case avro::AVRO_DOUBLE: + if (target.isFloat64()) + { + return [](IColumn & column, avro::Decoder & decoder) { + assert_cast(column).insertValue(decoder.decodeDouble()); + }; + } + break; + case avro::AVRO_BOOL: + if (target.isUInt8()) + { + return 
+ [](IColumn & column, avro::Decoder & decoder) { assert_cast(column).insertValue(decoder.decodeBool()); }; + } + break; + case avro::AVRO_ARRAY: { + if (target.isArray()) + { + auto nested_source_type = root_node->leafAt(0); + auto nested_target_type = assert_cast(*target_type).getNestedType(); + auto nested_deserialize = createDeserializeFn(nested_source_type, nested_target_type); + return [nested_deserialize](IColumn & column, avro::Decoder & decoder) { + ColumnArray & column_array = assert_cast(column); + ColumnArray::Offsets & offsets = column_array.getOffsets(); + IColumn & nested_column = column_array.getData(); + size_t total = 0; + for (size_t n = decoder.arrayStart(); n != 0; n = decoder.arrayNext()) + { + total += n; + for (size_t i = 0; i < n; i++) + { + nested_deserialize(nested_column, decoder); + } + } + offsets.push_back(offsets.back() + total); + }; + } + break; + } + case avro::AVRO_UNION: { + auto nullable_deserializer = [root_node, target_type](size_t non_null_union_index) { + auto nested_deserialize = createDeserializeFn(root_node->leafAt(non_null_union_index), removeNullable(target_type)); + return [non_null_union_index, nested_deserialize](IColumn & column, avro::Decoder & decoder) { + ColumnNullable & col = assert_cast(column); + size_t union_index = decoder.decodeUnionIndex(); + if (union_index == non_null_union_index) + { + nested_deserialize(col.getNestedColumn(), decoder); + col.getNullMapData().push_back(0); + } + else + { + col.insertDefault(); + } + }; + }; + if (root_node->leaves() == 2 && target.isNullable()) + { + if (root_node->leafAt(0)->type() == avro::AVRO_NULL) + return nullable_deserializer(1); + if (root_node->leafAt(1)->type() == avro::AVRO_NULL) + return nullable_deserializer(0); + } + break; + } + case avro::AVRO_NULL: + if (target.isNullable()) + { + auto nested_type = removeNullable(target_type); + if (nested_type->getTypeId() == TypeIndex::Nothing) + { + return [](IColumn & column, avro::Decoder & decoder) { + 
(void)column; + decoder.decodeNull(); + }; + } + else + { + return [](IColumn & column, avro::Decoder & decoder) { + ColumnNullable & col = assert_cast(column); + decoder.decodeNull(); + col.insertDefault(); + }; + } + } + break; + case avro::AVRO_ENUM: + if (target.isString()) + { + std::vector symbols; + for (size_t i = 0; i < root_node->names(); i++) + { + symbols.push_back(root_node->nameAt(i)); + } + return [symbols](IColumn & column, avro::Decoder & decoder) { + size_t enum_index = decoder.decodeEnum(); + const auto & enum_symbol = symbols[enum_index]; + column.insertData(enum_symbol.c_str(), enum_symbol.length()); + }; + } + if (target.isEnum()) + { + const auto & enum_type = assert_cast(*target_type); + std::vector symbol_mapping; + for (size_t i = 0; i < root_node->names(); i++) + { + symbol_mapping.push_back(enum_type.castToValue(root_node->nameAt(i))); + } + return [symbol_mapping](IColumn & column, avro::Decoder & decoder) { + size_t enum_index = decoder.decodeEnum(); + column.insert(symbol_mapping[enum_index]); + }; + } + break; + case avro::AVRO_FIXED: { + size_t fixed_size = root_node->fixedSize(); + if (target.isFixedString() && target_type->getSizeOfValueInMemory() == fixed_size) + { + return [tmp_fixed = std::vector(fixed_size)](IColumn & column, avro::Decoder & decoder) mutable { + decoder.decodeFixed(tmp_fixed.size(), tmp_fixed); + column.insertData(reinterpret_cast(tmp_fixed.data()), tmp_fixed.size()); + }; + } + break; + } + case avro::AVRO_MAP: + case avro::AVRO_RECORD: + default: + break; + } + + throw Exception( + "Type " + target_type->getName() + " is not compatible" + " with Avro " + avro::ValidSchema(root_node).toJson(false), + ErrorCodes::ILLEGAL_COLUMN); +} + +AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) +{ + switch (root_node->type()) + { + case avro::AVRO_STRING: + return [](avro::Decoder & decoder) { decoder.skipString(); }; + case avro::AVRO_BYTES: + return [](avro::Decoder & decoder) { 
decoder.skipBytes(); }; + case avro::AVRO_INT: + return [](avro::Decoder & decoder) { decoder.decodeInt(); }; + case avro::AVRO_LONG: + return [](avro::Decoder & decoder) { decoder.decodeLong(); }; + case avro::AVRO_FLOAT: + return [](avro::Decoder & decoder) { decoder.decodeFloat(); }; + case avro::AVRO_DOUBLE: + return [](avro::Decoder & decoder) { decoder.decodeDouble(); }; + case avro::AVRO_BOOL: + return [](avro::Decoder & decoder) { decoder.decodeBool(); }; + case avro::AVRO_ARRAY: { + auto nested_skip_fn = createSkipFn(root_node->leafAt(0)); + return [nested_skip_fn](avro::Decoder & decoder) { + for (size_t n = decoder.arrayStart(); n != 0; n = decoder.arrayNext()) + { + for (size_t i = 0; i < n; ++i) + { + nested_skip_fn(decoder); + } + } + }; + } + case avro::AVRO_UNION: { + std::vector union_skip_fns; + for (size_t i = 0; i < root_node->leaves(); i++) + { + union_skip_fns.push_back(createSkipFn(root_node->leafAt(i))); + } + return [union_skip_fns](avro::Decoder & decoder) { union_skip_fns[decoder.decodeUnionIndex()](decoder); }; + } + case avro::AVRO_NULL: + return [](avro::Decoder & decoder) { decoder.decodeNull(); }; + case avro::AVRO_ENUM: + return [](avro::Decoder & decoder) { decoder.decodeEnum(); }; + case avro::AVRO_FIXED: { + auto fixed_size = root_node->fixedSize(); + return [fixed_size](avro::Decoder & decoder) { decoder.skipFixed(fixed_size); }; + } + + case avro::AVRO_MAP: { + auto value_skip_fn = createSkipFn(root_node->leafAt(1)); + return [value_skip_fn](avro::Decoder & decoder) { + for (size_t n = decoder.mapStart(); n != 0; n = decoder.mapNext()) + { + for (size_t i = 0; i < n; ++i) + { + decoder.skipString(); + value_skip_fn(decoder); + } + } + }; + } + case avro::AVRO_RECORD: { + std::vector field_skip_fns; + for (size_t i = 0; i < root_node->leaves(); i++) + { + field_skip_fns.push_back(createSkipFn(root_node->leafAt(i))); + } + return [field_skip_fns](avro::Decoder & decoder) { + for (auto & skip_fn : field_skip_fns) + 
skip_fn(decoder); + }; + } + default: + throw Exception("Unsupported Avro type", ErrorCodes::ILLEGAL_COLUMN); + } +} + + +AvroDeserializer::AvroDeserializer(const DB::ColumnsWithTypeAndName & columns, avro::ValidSchema schema) +{ + auto schema_root = schema.root(); + if (schema_root->type() != avro::AVRO_RECORD) + { + throw Exception("Root schema must be a record", ErrorCodes::TYPE_MISMATCH); + } + field_mapping.resize(schema_root->leaves(), -1); + for (size_t i = 0; i < schema_root->leaves(); ++i) + { + skip_fns.push_back(createSkipFn(schema_root->leafAt(i))); + deserialize_fns.push_back(&deserializeNoop); + } + for (size_t i = 0; i < columns.size(); ++i) + { + const auto & column = columns[i]; + size_t field_index; + if (!schema_root->nameIndex(column.name, field_index)) + { + throw Exception("Field " + column.name + " not found in Avro schema", ErrorCodes::THERE_IS_NO_COLUMN); + } + auto field_schema = schema_root->leafAt(field_index); + try + { + deserialize_fns[field_index] = createDeserializeFn(field_schema, column.type); + } + catch (Exception & e) + { + e.addMessage("column " + column.name); + e.rethrow(); + } + field_mapping[field_index] = i; + } +} + +void AvroDeserializer::deserializeRow(MutableColumns & columns, avro::Decoder & decoder) +{ + for (size_t i = 0; i < field_mapping.size(); i++) + { + if (field_mapping[i] >= 0) + { + deserialize_fns[i](*columns[field_mapping[i]], decoder); + } + else + { + skip_fns[i](decoder); + } + } +} + + +AvroRowInputFormat::AvroRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) + : IRowInputFormat(header_, in_, params_) + , file_reader(std::make_unique(in_)) + , deserializer(header_.getColumnsWithTypeAndName(), file_reader.dataSchema()) +{ + file_reader.init(); +} + +bool AvroRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) +{ + if (file_reader.hasMore()) + { + file_reader.decr(); + deserializer.deserializeRow(columns, file_reader.decoder()); + return true; + } + return false; 
+} + + +class AvroConfluentRowInputFormat::SchemaRegistry +{ +public: + SchemaRegistry(const std::string & base_url_) + { + if (base_url_.empty()) + { + throw Exception("Empty Schema Registry URL", ErrorCodes::BAD_ARGUMENTS); + } + try + { + base_url = base_url_; + } + catch (Poco::SyntaxException & e) + { + throw Exception("Invalid Schema Registry URL", Exception(Exception::CreateFromPoco, e), ErrorCodes::BAD_ARGUMENTS); + } + } + + avro::ValidSchema getSchema(uint32_t id) + { + try + { + try + { + Poco::URI url(base_url, "/schemas/ids/" + std::to_string(id)); + Poco::Net::HTTPClientSession session(url.getHost(), url.getPort()); + Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, url.getPathAndQuery()); + session.sendRequest(request); + Poco::Net::HTTPResponse response; + auto & response_body = session.receiveResponse(response); + if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_OK) + { + throw Exception("http code " + std::to_string(response.getStatus()), ErrorCodes::INCORRECT_DATA); + } + Poco::JSON::Parser parser; + auto json_body = parser.parse(response_body).extract(); + auto schema = json_body->getValue("schema"); + return avro::compileJsonSchemaFromString(schema); + } + catch (const Exception & e) + { + throw e; + } + catch (const Poco::Exception & e) + { + throw Exception(Exception::CreateFromPoco, e); + } + catch (const avro::Exception & e) + { + throw Exception(e.what(), ErrorCodes::INCORRECT_DATA); + } + } + catch (Exception & e) + { + e.addMessage("while fetching schema id=" + std::to_string(id)); + throw; + } + } + +private: + Poco::URI base_url; +}; + +static uint32_t readConfluentSchemaId(ReadBuffer & in) +{ + Poco::Buffer buf(5); + in.readStrict(buf.begin(), buf.capacity()); + Poco::MemoryBinaryReader binary_reader(buf, Poco::BinaryReader::BIG_ENDIAN_BYTE_ORDER); + + uint8_t magic; + uint32_t schema_id; + binary_reader >> magic >> schema_id; + if (magic != 0x00) + { + throw Exception("Invalid magic byte", 
ErrorCodes::INCORRECT_DATA); + } + + return schema_id; +} + +AvroConfluentRowInputFormat::AvroConfluentRowInputFormat( + const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_) + : IRowInputFormat(header_.cloneEmpty(), in_, params_) + , columns(header_.getColumnsWithTypeAndName()) + , schema_registry(std::make_unique(format_settings_.avro.schema_registry_url)) + , input_stream(std::make_unique(in)) + , decoder(avro::binaryDecoder()) + +{ + (void)format_settings_; + decoder->init(*input_stream); +} + +bool AvroConfluentRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) +{ + if (in.eof()) + { + return false; + } + SchemaId schema_id = readConfluentSchemaId(in); + auto & deserializer = getOrCreateDeserializer(schema_id); + deserializer.deserializeRow(columns, *decoder); + decoder->drain(); + return true; +} + +AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(SchemaId schema_id) +{ + auto it = deserializer_cache.find(schema_id); + if (it == deserializer_cache.end()) + { + auto schema = schema_registry->getSchema(schema_id); + AvroDeserializer deserializer(columns, schema); + it = deserializer_cache.emplace(schema_id, deserializer).first; + } + return it->second; +} + +void registerInputFormatProcessorAvro(FormatFactory & factory) +{ + factory.registerInputFormatProcessor( + "Avro", + [=](ReadBuffer & buf, + const Block & sample, + const Context & context, + const RowInputFormatParams & params, + const FormatSettings & settings) { + (void)(params); + (void)(settings); + (void)(context); + return std::make_shared(sample, buf, params); + }); + + factory.registerInputFormatProcessor( + "AvroConfluent", + [=](ReadBuffer & buf, + const Block & sample, + const Context & context, + const RowInputFormatParams & params, + const FormatSettings & settings) { + (void)(params); + (void)(settings); + (void)(context); + return std::make_shared(sample, buf, params, settings); + }); +} + +} + +#else + 
+namespace DB +{ +class FormatFactory; +void registerInputFormatProcessorAvro(FormatFactory &) +{ +} +} + +#endif diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h new file mode 100644 index 00000000000..65b46074d7e --- /dev/null +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -0,0 +1,70 @@ +#pragma once +#include "config_formats.h" +#if USE_AVRO + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + + +namespace DB +{ +class AvroDeserializer +{ +public: + AvroDeserializer(const DB::ColumnsWithTypeAndName & columns, avro::ValidSchema schema); + void deserializeRow(MutableColumns & columns, avro::Decoder & decoder); + +private: + using DeserializeFn = std::function; + using SkipFn = std::function; + static DeserializeFn createDeserializeFn(avro::NodePtr root_node, DataTypePtr target_type); + static SkipFn createSkipFn(avro::NodePtr root_node); + + std::vector field_mapping; + std::vector skip_fns; + std::vector deserialize_fns; +}; + +class AvroRowInputFormat : public IRowInputFormat +{ +public: + AvroRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_); + virtual bool readRow(MutableColumns & columns, RowReadExtension & ext) override; + String getName() const override { return "AvroRowInputFormat"; } + +private: + avro::DataFileReaderBase file_reader; + AvroDeserializer deserializer; +}; + +class AvroConfluentRowInputFormat : public IRowInputFormat +{ +public: + AvroConfluentRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_); + virtual bool readRow(MutableColumns & columns, RowReadExtension & ext) override; + String getName() const override { return "AvroConfluentRowInputFormat"; } + +private: + const DB::ColumnsWithTypeAndName columns; + + class SchemaRegistry; + std::unique_ptr schema_registry; + + using SchemaId = uint32_t; + std::unordered_map 
deserializer_cache; + AvroDeserializer & getOrCreateDeserializer(SchemaId schema_id); + + avro::InputStreamPtr input_stream; + avro::DecoderPtr decoder; +}; +} +#endif diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp new file mode 100644 index 00000000000..0fd40a6e36c --- /dev/null +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -0,0 +1,326 @@ +#include "AvroRowOutputFormat.h" +#if USE_AVRO + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_TYPE_OF_FIELD; + extern const int BAD_ARGUMENTS; + extern const int THERE_IS_NO_COLUMN; + extern const int LOGICAL_ERROR; + extern const int INCORRECT_DATA; + extern const int CANNOT_READ_ALL_DATA; +} + +class OutputStreamWriteBufferAdapter : public avro::OutputStream +{ +public: + OutputStreamWriteBufferAdapter(WriteBuffer & out_) : out(out_) {} + + virtual bool next(uint8_t ** data, size_t * len) override + { + out.nextIfAtEnd(); + *data = (uint8_t *)out.position(); + *len = out.available(); + out.position() += out.available(); + + return true; + } + + virtual void backup(size_t len) override { out.position() -= len; } + + virtual uint64_t byteCount() const override { return out.count(); } + virtual void flush() override { out.next(); } + +private: + WriteBuffer & out; +}; + + +AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeFn(DataTypePtr data_type) +{ + switch (data_type->getTypeId()) + { + case TypeIndex::UInt8: + return {avro::BoolSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + 
encoder.encodeBool(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::Int32: + return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + encoder.encodeInt(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::Int64: + return {avro::LongSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + encoder.encodeLong(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::Float32: + return {avro::FloatSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + encoder.encodeFloat(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::Float64: + return {avro::DoubleSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + encoder.encodeDouble(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::Date: { + auto schema = avro::IntSchema(); + schema.root()->setLogicalType(avro::LogicalType(avro::LogicalType::DATE)); + return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + UInt16 date = assert_cast(column).getElement(row_num); + encoder.encodeInt(date); + }}; + } + case TypeIndex::DateTime: + throw Exception("Unsupported Avro type", ErrorCodes::BAD_TYPE_OF_FIELD); + case TypeIndex::DateTime64: { + auto schema = avro::LongSchema(); + const auto & provided_type = assert_cast(*data_type); + + if (provided_type.getScale() == 3) + schema.root()->setLogicalType(avro::LogicalType(avro::LogicalType::TIMESTAMP_MILLIS)); + else if (provided_type.getScale() == 6) + schema.root()->setLogicalType(avro::LogicalType(avro::LogicalType::TIMESTAMP_MICROS)); + else + throw Exception("Unsupported Avro type", ErrorCodes::BAD_TYPE_OF_FIELD); + + return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + const auto & col = assert_cast(column); + encoder.encodeLong(col.getElement(row_num)); + }}; + } + case TypeIndex::String: + return {avro::StringSchema(), [](const 
IColumn & column, size_t row_num, avro::Encoder & encoder) { + const StringRef & s = assert_cast(column).getDataAt(row_num); + encoder.encodeBytes(reinterpret_cast(s.data), s.size); + }}; + case TypeIndex::FixedString: { + return {avro::FixedSchema(data_type->getSizeOfValueInMemory(), "fixed"), + [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + const StringRef & s = assert_cast(column).getDataAt(row_num); + encoder.encodeFixed(reinterpret_cast(s.data), s.size); + }}; + } + case TypeIndex::Enum8: { + auto schema = avro::EnumSchema("enum"); + std::unordered_map enum_mapping; + const auto & enum_values = assert_cast(*data_type).getValues(); + for (size_t i = 0; i < enum_values.size(); ++i) + { + schema.addSymbol(enum_values[i].first); + enum_mapping.emplace(enum_values[i].second, i); + } + return {schema, [enum_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + auto enum_value = assert_cast(column).getElement(row_num); + encoder.encodeEnum(enum_mapping.at(enum_value)); + }}; + } + case TypeIndex::Enum16: { + auto schema = avro::EnumSchema("enum"); + std::unordered_map enum_mapping; + const auto & enum_values = assert_cast(*data_type).getValues(); + for (size_t i = 0; i < enum_values.size(); ++i) + { + schema.addSymbol(enum_values[i].first); + enum_mapping.emplace(enum_values[i].second, i); + } + return {schema, [enum_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + auto enum_value = assert_cast(column).getElement(row_num); + encoder.encodeEnum(enum_mapping.at(enum_value)); + }}; + } + case TypeIndex::Array: { + const auto & array_type = assert_cast(*data_type); + auto nested_mapping = createSchemaWithSerializeFn(array_type.getNestedType()); + return {avro::ArraySchema(nested_mapping.schema), + [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + const ColumnArray & column_array = assert_cast(column); + const ColumnArray::Offsets & offsets = 
column_array.getOffsets(); + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + size_t row_count = next_offset - offset; + const IColumn & nested_column = column_array.getData(); + + encoder.arrayStart(); + if (row_count > 0) + { + encoder.setItemCount(row_count); + } + for (size_t i = offset; i < next_offset; ++i) + { + nested_mapping.serialize(nested_column, i, encoder); + } + encoder.arrayEnd(); + }}; + } + case TypeIndex::Nullable: { + auto nested_type = removeNullable(data_type); + auto nested_mapping = createSchemaWithSerializeFn(nested_type); + if (nested_type->getTypeId() == TypeIndex::Nothing) + { + return nested_mapping; + } + else + { + avro::UnionSchema union_schema; + union_schema.addType(avro::NullSchema()); + union_schema.addType(nested_mapping.schema); + return {union_schema, [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + const ColumnNullable & col = assert_cast(column); + if (!col.isNullAt(row_num)) + { + encoder.encodeUnionIndex(1); + nested_mapping.serialize(col.getNestedColumn(), row_num, encoder); + } + else + { + encoder.encodeUnionIndex(0); + encoder.encodeNull(); + } + }}; + } + } + case TypeIndex::LowCardinality: { + const auto & nested_type = removeLowCardinality(data_type); + auto nested_mapping = createSchemaWithSerializeFn(nested_type); + return {nested_mapping.schema, [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { + const auto & col = assert_cast(column); + nested_mapping.serialize(*col.getDictionary().getNestedColumn(), col.getIndexAt(row_num), encoder); + }}; + } + case TypeIndex::Nothing: + return {avro::NullSchema(), [](const IColumn &, size_t, avro::Encoder & encoder) { encoder.encodeNull(); }}; + default: + break; + } + throw Exception("Type " + data_type->getName() + " is not supported for Avro output", ErrorCodes::ILLEGAL_COLUMN); +} + + +AvroSerializer::AvroSerializer(const ColumnsWithTypeAndName & columns) +{ + 
avro::RecordSchema record_schema("row"); + for (auto & column : columns) + { + try + { + auto field_mapping = createSchemaWithSerializeFn(column.type); + serialize_fns.push_back(field_mapping.serialize); + //TODO: verify name starts with A-Za-z_ + record_schema.addField(column.name, field_mapping.schema); + } + catch (Exception & e) + { + e.addMessage("column " + column.name); + e.rethrow(); + } + } + schema.setSchema(record_schema); +} + +void AvroSerializer::serializeRow(const Columns & columns, size_t row_num, avro::Encoder & encoder) +{ + size_t num_columns = columns.size(); + for (size_t i = 0; i < num_columns; ++i) + { + serialize_fns[i](*columns[i], row_num, encoder); + } +} + +AvroRowOutputFormat::AvroRowOutputFormat( + WriteBuffer & out_, const Block & header_, FormatFactory::WriteCallback callback, const FormatSettings & settings_) + : IRowOutputFormat(header_, out_, callback) + , settings(settings_) + , serializer(header_.getColumnsWithTypeAndName()) + , file_writer(std::make_unique(out_), serializer.getSchema(), 16 * 1024, avro::Codec::SNAPPY_CODEC) +{ +} + +AvroRowOutputFormat::~AvroRowOutputFormat() = default; + +void AvroRowOutputFormat::writePrefix() +{ + file_writer.syncIfNeeded(); +} + +void AvroRowOutputFormat::write(const Columns & columns, size_t row_num) +{ + file_writer.syncIfNeeded(); + serializer.serializeRow(columns, row_num, file_writer.encoder()); + file_writer.incr(); +} + +void AvroRowOutputFormat::writeSuffix() +{ + file_writer.close(); +} + +void registerOutputFormatProcessorAvro(FormatFactory & factory) +{ + factory.registerOutputFormatProcessor( + "Avro", + [=](WriteBuffer & buf, + const Block & sample, + const Context & context, + FormatFactory::WriteCallback callback, + const FormatSettings & settings) { + (void)(context); + (void)(callback); + return std::make_shared(buf, sample, callback, settings); + }); +} + +} + +#else + +namespace DB +{ +class FormatFactory; +void registerOutputFormatProcessorAvro(FormatFactory &) +{ +} +} 
+ +#endif diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.h b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.h new file mode 100644 index 00000000000..efe63c1a72f --- /dev/null +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.h @@ -0,0 +1,60 @@ +#pragma once +#include "config_formats.h" +#if USE_AVRO +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + + +namespace DB +{ +class WriteBuffer; + +class AvroSerializer +{ +public: + AvroSerializer(const ColumnsWithTypeAndName & columns); + const avro::ValidSchema & getSchema() const { return schema; } + void serializeRow(const Columns & columns, size_t row_num, avro::Encoder & encoder); + +private: + using SerializeFn = std::function; + struct SchemaWithSerializeFn + { + avro::Schema schema; + SerializeFn serialize; + }; + static SchemaWithSerializeFn createSchemaWithSerializeFn(DataTypePtr data_type); + + std::vector serialize_fns; + avro::ValidSchema schema; +}; + +class AvroRowOutputFormat : public IRowOutputFormat +{ +public: + AvroRowOutputFormat(WriteBuffer & out_, const Block & header_, FormatFactory::WriteCallback callback, const FormatSettings & settings_); + virtual ~AvroRowOutputFormat() override; + + String getName() const override { return "AvroRowOutputFormat"; } + void write(const Columns & columns, size_t row_num) override; + void writeField(const IColumn &, const IDataType &, size_t) override {} + virtual void writePrefix() override; + virtual void writeSuffix() override; + +private: + FormatSettings settings; + AvroSerializer serializer; + avro::DataFileWriterBase file_writer; +}; + +} +#endif From 38ab600a087ec688bdff26940f7dc5fc5e60338c Mon Sep 17 00:00:00 2001 From: oandrew Date: Wed, 8 Jan 2020 22:22:49 -0600 Subject: [PATCH 02/89] Fix compilation errors --- .../Processors/Formats/Impl/AvroRowInputFormat.cpp | 13 +++++-------- .../Processors/Formats/Impl/AvroRowInputFormat.h | 2 +- 
.../Processors/Formats/Impl/AvroRowOutputFormat.cpp | 4 +--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index eca22670a87..f7b34a076d3 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -90,7 +90,7 @@ public: return false; } - *data = (const uint8_t *)in.position(); + *data = reinterpret_cast(in.position()); *len = in.available(); in.position() += in.available(); @@ -125,6 +125,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node column.insertData(tmp.c_str(), tmp.length()); }; } + break; case avro::AVRO_BYTES: if (target.isString()) { @@ -472,7 +473,7 @@ public: } catch (Poco::SyntaxException & e) { - throw Exception("Invalid Schema Registry URL", Exception(Exception::CreateFromPoco, e), ErrorCodes::BAD_ARGUMENTS); + throw Exception("Invalid Schema Registry URL: " + e.displayText(), ErrorCodes::BAD_ARGUMENTS); } } @@ -541,7 +542,7 @@ static uint32_t readConfluentSchemaId(ReadBuffer & in) AvroConfluentRowInputFormat::AvroConfluentRowInputFormat( const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_) : IRowInputFormat(header_.cloneEmpty(), in_, params_) - , columns(header_.getColumnsWithTypeAndName()) + , header_columns(header_.getColumnsWithTypeAndName()) , schema_registry(std::make_unique(format_settings_.avro.schema_registry_url)) , input_stream(std::make_unique(in)) , decoder(avro::binaryDecoder()) @@ -570,7 +571,7 @@ AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(SchemaId if (it == deserializer_cache.end()) { auto schema = schema_registry->getSchema(schema_id); - AvroDeserializer deserializer(columns, schema); + AvroDeserializer deserializer(header_columns, schema); it = deserializer_cache.emplace(schema_id, deserializer).first; } return it->second; 
@@ -582,12 +583,10 @@ void registerInputFormatProcessorAvro(FormatFactory & factory) "Avro", [=](ReadBuffer & buf, const Block & sample, - const Context & context, const RowInputFormatParams & params, const FormatSettings & settings) { (void)(params); (void)(settings); - (void)(context); return std::make_shared(sample, buf, params); }); @@ -595,12 +594,10 @@ void registerInputFormatProcessorAvro(FormatFactory & factory) "AvroConfluent", [=](ReadBuffer & buf, const Block & sample, - const Context & context, const RowInputFormatParams & params, const FormatSettings & settings) { (void)(params); (void)(settings); - (void)(context); return std::make_shared(sample, buf, params, settings); }); } diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h index 65b46074d7e..ef5e01973dd 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -54,7 +54,7 @@ public: String getName() const override { return "AvroConfluentRowInputFormat"; } private: - const DB::ColumnsWithTypeAndName columns; + const DB::ColumnsWithTypeAndName header_columns; class SchemaRegistry; std::unique_ptr schema_registry; diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 0fd40a6e36c..786626170fc 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -63,7 +63,7 @@ public: virtual bool next(uint8_t ** data, size_t * len) override { out.nextIfAtEnd(); - *data = (uint8_t *)out.position(); + *data = reinterpret_cast(out.position()); *len = out.available(); out.position() += out.available(); @@ -302,10 +302,8 @@ void registerOutputFormatProcessorAvro(FormatFactory & factory) "Avro", [=](WriteBuffer & buf, const Block & sample, - const Context & context, FormatFactory::WriteCallback callback, const 
FormatSettings & settings) { - (void)(context); (void)(callback); return std::make_shared(buf, sample, callback, settings); }); From e96b3059c04eecaa06ce4343424e8d3588d5d028 Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Thu, 9 Jan 2020 23:08:29 -0600 Subject: [PATCH 03/89] Refactor Avro CMake --- cmake/find/avro.cmake | 21 ++------------ contrib/CMakeLists.txt | 42 ++++++++++----------------- contrib/avro-cmake/CMakeLists.txt | 48 ++++++++++++++----------------- contrib/boost | 2 +- 4 files changed, 40 insertions(+), 73 deletions(-) diff --git a/cmake/find/avro.cmake b/cmake/find/avro.cmake index 7eb5c187cf8..39ad2e31e54 100644 --- a/cmake/find/avro.cmake +++ b/cmake/find/avro.cmake @@ -2,7 +2,7 @@ option (ENABLE_AVRO "Enable Avro" ${ENABLE_LIBRARIES}) if (ENABLE_AVRO) -option (USE_INTERNAL_AVRO_LIBRARY "Set to FALSE to use system avro library instead of bundled" ${NOT_UNBUNDLED}) +option (USE_INTERNAL_AVRO_LIBRARY "Set to FALSE to use system avro library instead of bundled" ON) if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/avro/lang/c++/CMakeLists.txt") if(USE_INTERNAL_AVRO_LIBRARY) @@ -13,31 +13,16 @@ if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/avro/lang/c++/CMakeLists.txt") endif() if (NOT USE_INTERNAL_AVRO_LIBRARY) - find_package(Snappy REQUIRED) - find_library(AVROCPP avrocpp) elseif(NOT MISSING_INTERNAL_AVRO_LIBRARY) include(cmake/find/snappy.cmake) - add_subdirectory(contrib/avro-cmake) set(AVROCPP_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/avro/lang/c++/include") - set(AVROCPP_LIBRARY avrocpp_s) + set(AVROCPP_LIBRARY avrocpp) endif () if (AVROCPP_LIBRARY AND AVROCPP_INCLUDE_DIR) set(USE_AVRO 1) endif() - -# if (AVROCPP_LIBRARY AND AVROCPP_INCLUDE_DIR) -# set(USE_AVROCPP 1) -# elseif (Boost_INCLUDE_DIRS AND SNAPPY_LIBRARY) -# set(AVROCPP_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/avro/lang/c++/include") -# set(AVROCPP_LIBRARY avrocpp_s) -# set(USE_AVROCPP 1) -# else() -# set(USE_INTERNAL_AVROCPP_LIBRARY 0) -# message(STATUS "avro deps: 
${Boost_INCLUDE_DIRS}; ${SNAPPY_LIBRARY}; ${ZLIB_LIBRARY}") -# endif() - endif() -message (STATUS "Using avro=${USE_AVRO}: ${AVROCPP_LIBRARY} ${AVROCPP_INCLUDE_DIR}") +message (STATUS "Using avro=${USE_AVRO}: ${AVROCPP_INCLUDE_DIR} : ${AVROCPP_LIBRARY}") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index f155940c32a..c8b8a7ed7a1 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -146,6 +146,20 @@ if (ENABLE_ICU AND USE_INTERNAL_ICU_LIBRARY) add_subdirectory (icu-cmake) endif () +if(USE_INTERNAL_SNAPPY_LIBRARY) + set(SNAPPY_BUILD_TESTS 0 CACHE INTERNAL "") + if (NOT MAKE_STATIC_LIBRARIES) + set(BUILD_SHARED_LIBS 1) # TODO: set at root dir + endif() + + add_subdirectory(snappy) + + set (SNAPPY_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/snappy") + if(SANITIZE STREQUAL "undefined") + target_compile_options(${SNAPPY_LIBRARY} PRIVATE -fno-sanitize=undefined) + endif() +endif() + if (USE_INTERNAL_PARQUET_LIBRARY) if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE) # We dont use arrow's cmakefiles because they uses too many depends and download some libs in compile time @@ -189,20 +203,6 @@ if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE) endif() else() - if(USE_INTERNAL_SNAPPY_LIBRARY) - set(SNAPPY_BUILD_TESTS 0 CACHE INTERNAL "") - if (NOT MAKE_STATIC_LIBRARIES) - set(BUILD_SHARED_LIBS 1) # TODO: set at root dir - endif() - - add_subdirectory(snappy) - - set (SNAPPY_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/snappy") - if(SANITIZE STREQUAL "undefined") - target_compile_options(${SNAPPY_LIBRARY} PRIVATE -fno-sanitize=undefined) - endif() - endif() - add_subdirectory(arrow-cmake) # The library is large - avoid bloat. 
@@ -213,19 +213,7 @@ endif() endif() if (USE_INTERNAL_AVRO_LIBRARY) - if(USE_INTERNAL_SNAPPY_LIBRARY) - set(SNAPPY_BUILD_TESTS 0 CACHE INTERNAL "") - if (NOT MAKE_STATIC_LIBRARIES) - set(BUILD_SHARED_LIBS 1) # TODO: set at root dir - endif() - - add_subdirectory(snappy) - - set (SNAPPY_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/snappy") - if(SANITIZE STREQUAL "undefined") - target_compile_options(${SNAPPY_LIBRARY} PRIVATE -fno-sanitize=undefined) - endif() - endif() + add_subdirectory(avro-cmake) endif() if (USE_INTERNAL_POCO_LIBRARY) diff --git a/contrib/avro-cmake/CMakeLists.txt b/contrib/avro-cmake/CMakeLists.txt index 643c68c54c5..f544b3c50cd 100644 --- a/contrib/avro-cmake/CMakeLists.txt +++ b/contrib/avro-cmake/CMakeLists.txt @@ -1,9 +1,8 @@ -# project and source dir set(AVROCPP_ROOT_DIR ${CMAKE_SOURCE_DIR}/contrib/avro/lang/c++) set(AVROCPP_INCLUDE_DIR ${AVROCPP_ROOT_DIR}/api) set(AVROCPP_SOURCE_DIR ${AVROCPP_ROOT_DIR}/impl) -#set(AVROCPP_COMMON_DIR ${HDFS3_SOURCE_DIR}/common) +set (CMAKE_CXX_STANDARD 17) if (EXISTS ${AVROCPP_ROOT_DIR}/../../share/VERSION.txt) file(READ "${AVROCPP_ROOT_DIR}/../../share/VERSION.txt" @@ -41,36 +40,31 @@ set (AVROCPP_SOURCE_FILES ${AVROCPP_SOURCE_DIR}/Validator.cc ) +add_library (avrocpp ${AVROCPP_SOURCE_FILES}) +set_target_properties (avrocpp PROPERTIES VERSION ${AVRO_VERSION_MAJOR}.${AVRO_VERSION_MINOR}) -add_definitions(-std=c++17 -fPIC) +target_include_directories(avrocpp SYSTEM PUBLIC ${AVROCPP_INCLUDE_DIR}) -add_library (avrocpp SHARED ${AVROCPP_SOURCE_FILES}) +target_include_directories(avrocpp SYSTEM PUBLIC ${Boost_INCLUDE_DIRS}) +target_link_libraries (avrocpp ${Boost_IOSTREAMS_LIBRARY}) -set_property (TARGET avrocpp - APPEND PROPERTY COMPILE_DEFINITIONS AVRO_DYN_LINK) +if (SNAPPY_INCLUDE_DIR AND SNAPPY_LIBRARY) + target_compile_definitions (avrocpp PUBLIC SNAPPY_CODEC_AVAILABLE) + target_include_directories (avrocpp PRIVATE ${SNAPPY_INCLUDE_DIR}) + target_link_libraries (avrocpp ${SNAPPY_LIBRARY}) +endif () 
-add_library (avrocpp_s STATIC ${AVROCPP_SOURCE_FILES}) +if (COMPILER_GCC) + set (SUPPRESS_WARNINGS -Wno-non-virtual-dtor) +elseif (COMPILER_CLANG) + set (SUPPRESS_WARNINGS -Wno-non-virtual-dtor) +endif () -set_property (TARGET avrocpp avrocpp_s - APPEND PROPERTY COMPILE_DEFINITIONS AVRO_SOURCE) +target_compile_options(avrocpp PRIVATE ${SUPPRESS_WARNINGS}) -set_target_properties (avrocpp PROPERTIES - VERSION ${AVRO_VERSION_MAJOR}.${AVRO_VERSION_MINOR}) - -set_target_properties (avrocpp_s PROPERTIES - VERSION ${AVRO_VERSION_MAJOR}.${AVRO_VERSION_MINOR}) - -target_link_libraries (avrocpp ${Boost_IOSTREAMS_LIBRARY} ${SNAPPY_LIBRARY}) -target_link_libraries (avrocpp_s ${Boost_IOSTREAMS_LIBRARY} ${SNAPPY_LIBRARY}) - -target_compile_definitions (avrocpp PUBLIC SNAPPY_CODEC_AVAILABLE) -target_compile_definitions (avrocpp_s PUBLIC SNAPPY_CODEC_AVAILABLE) - -include_directories(${AVROCPP_INCLUDE_DIR}) -include_directories(${Boost_INCLUDE_DIRS}) -include_directories(${SNAPPY_INCLUDE_DIR}) - -ADD_CUSTOM_TARGET(symlink_headers ALL +# create a symlink to include headers with +ADD_CUSTOM_TARGET(avro_symlink_headers ALL COMMAND ${CMAKE_COMMAND} -E make_directory ${AVROCPP_ROOT_DIR}/include COMMAND ${CMAKE_COMMAND} -E create_symlink ${AVROCPP_ROOT_DIR}/api ${AVROCPP_ROOT_DIR}/include/avro -) \ No newline at end of file +) +add_dependencies(avrocpp avro_symlink_headers) \ No newline at end of file diff --git a/contrib/boost b/contrib/boost index a2cfeb63eaf..86be2aef20b 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit a2cfeb63eaf3b32cf233105b1a40f4a5f26b8495 +Subproject commit 86be2aef20bee2356b744e5569eed6eaded85dbe From 6e26211758c2b1feec19f6953468813354568a92 Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Thu, 9 Jan 2020 23:59:01 -0600 Subject: [PATCH 04/89] Fix code style --- .../Formats/Impl/AvroRowInputFormat.cpp | 131 ++++++----- .../Formats/Impl/AvroRowOutputFormat.cpp | 215 ++++++++++-------- 2 files changed, 194 insertions(+), 152 
deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index f7b34a076d3..a0eba94bfdb 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -120,7 +120,8 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node case avro::AVRO_STRING: if (target.isString()) { - return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable { + return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable + { decoder.decodeString(tmp); column.insertData(tmp.c_str(), tmp.length()); }; @@ -129,7 +130,8 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node case avro::AVRO_BYTES: if (target.isString()) { - return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable { + return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable + { decoder.decodeString(tmp); column.insertData(tmp.c_str(), tmp.length()); }; @@ -138,12 +140,15 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node case avro::AVRO_INT: if (target.isInt32()) { - return - [](IColumn & column, avro::Decoder & decoder) { assert_cast(column).insertValue(decoder.decodeInt()); }; + return [](IColumn & column, avro::Decoder & decoder) + { + assert_cast(column).insertValue(decoder.decodeInt()); + }; } if (target.isDate() && logical_type == avro::LogicalType::DATE) { - return [](IColumn & column, avro::Decoder & decoder) { + return [](IColumn & column, avro::Decoder & decoder) + { assert_cast(column).insertValue(decoder.decodeInt()); }; } @@ -151,8 +156,10 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node case avro::AVRO_LONG: if (target.isInt64()) { - return - [](IColumn & column, avro::Decoder & decoder) { assert_cast(column).insertValue(decoder.decodeLong()); }; + return 
[](IColumn & column, avro::Decoder & decoder) + { + assert_cast(column).insertValue(decoder.decodeLong()); + }; } if (target.isDateTime64()) { @@ -160,7 +167,8 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node if ((logical_type == avro::LogicalType::TIMESTAMP_MILLIS && date_time_scale == 3) || (logical_type == avro::LogicalType::TIMESTAMP_MICROS && date_time_scale == 6)) { - return [](IColumn & column, avro::Decoder & decoder) { + return [](IColumn & column, avro::Decoder & decoder) + { assert_cast(column).insertValue(decoder.decodeLong()); }; } @@ -169,7 +177,8 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node case avro::AVRO_FLOAT: if (target.isFloat32()) { - return [](IColumn & column, avro::Decoder & decoder) { + return [](IColumn & column, avro::Decoder & decoder) + { assert_cast(column).insertValue(decoder.decodeFloat()); }; } @@ -177,7 +186,8 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node case avro::AVRO_DOUBLE: if (target.isFloat64()) { - return [](IColumn & column, avro::Decoder & decoder) { + return [](IColumn & column, avro::Decoder & decoder) + { assert_cast(column).insertValue(decoder.decodeDouble()); }; } @@ -185,17 +195,20 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node case avro::AVRO_BOOL: if (target.isUInt8()) { - return - [](IColumn & column, avro::Decoder & decoder) { assert_cast(column).insertValue(decoder.decodeBool()); }; + return [](IColumn & column, avro::Decoder & decoder) + { + assert_cast(column).insertValue(decoder.decodeBool()); + }; } break; - case avro::AVRO_ARRAY: { + case avro::AVRO_ARRAY: if (target.isArray()) { auto nested_source_type = root_node->leafAt(0); auto nested_target_type = assert_cast(*target_type).getNestedType(); auto nested_deserialize = createDeserializeFn(nested_source_type, nested_target_type); - return [nested_deserialize](IColumn & column, avro::Decoder & decoder) { + 
return [nested_deserialize](IColumn & column, avro::Decoder & decoder) + { ColumnArray & column_array = assert_cast(column); ColumnArray::Offsets & offsets = column_array.getOffsets(); IColumn & nested_column = column_array.getData(); @@ -212,11 +225,13 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node }; } break; - } - case avro::AVRO_UNION: { - auto nullable_deserializer = [root_node, target_type](size_t non_null_union_index) { + case avro::AVRO_UNION: + { + auto nullable_deserializer = [root_node, target_type](size_t non_null_union_index) + { auto nested_deserialize = createDeserializeFn(root_node->leafAt(non_null_union_index), removeNullable(target_type)); - return [non_null_union_index, nested_deserialize](IColumn & column, avro::Decoder & decoder) { + return [non_null_union_index, nested_deserialize](IColumn & column, avro::Decoder & decoder) + { ColumnNullable & col = assert_cast(column); size_t union_index = decoder.decodeUnionIndex(); if (union_index == non_null_union_index) @@ -245,14 +260,15 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node auto nested_type = removeNullable(target_type); if (nested_type->getTypeId() == TypeIndex::Nothing) { - return [](IColumn & column, avro::Decoder & decoder) { - (void)column; + return [](IColumn &, avro::Decoder & decoder) + { decoder.decodeNull(); }; } else { - return [](IColumn & column, avro::Decoder & decoder) { + return [](IColumn & column, avro::Decoder & decoder) + { ColumnNullable & col = assert_cast(column); decoder.decodeNull(); col.insertDefault(); @@ -268,7 +284,8 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node { symbols.push_back(root_node->nameAt(i)); } - return [symbols](IColumn & column, avro::Decoder & decoder) { + return [symbols](IColumn & column, avro::Decoder & decoder) + { size_t enum_index = decoder.decodeEnum(); const auto & enum_symbol = symbols[enum_index]; 
column.insertData(enum_symbol.c_str(), enum_symbol.length()); @@ -282,17 +299,20 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node { symbol_mapping.push_back(enum_type.castToValue(root_node->nameAt(i))); } - return [symbol_mapping](IColumn & column, avro::Decoder & decoder) { + return [symbol_mapping](IColumn & column, avro::Decoder & decoder) + { size_t enum_index = decoder.decodeEnum(); column.insert(symbol_mapping[enum_index]); }; } break; - case avro::AVRO_FIXED: { + case avro::AVRO_FIXED: + { size_t fixed_size = root_node->fixedSize(); if (target.isFixedString() && target_type->getSizeOfValueInMemory() == fixed_size) { - return [tmp_fixed = std::vector(fixed_size)](IColumn & column, avro::Decoder & decoder) mutable { + return [tmp_fixed = std::vector(fixed_size)](IColumn & column, avro::Decoder & decoder) mutable + { decoder.decodeFixed(tmp_fixed.size(), tmp_fixed); column.insertData(reinterpret_cast(tmp_fixed.data()), tmp_fixed.size()); }; @@ -328,9 +348,11 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) return [](avro::Decoder & decoder) { decoder.decodeDouble(); }; case avro::AVRO_BOOL: return [](avro::Decoder & decoder) { decoder.decodeBool(); }; - case avro::AVRO_ARRAY: { + case avro::AVRO_ARRAY: + { auto nested_skip_fn = createSkipFn(root_node->leafAt(0)); - return [nested_skip_fn](avro::Decoder & decoder) { + return [nested_skip_fn](avro::Decoder & decoder) + { for (size_t n = decoder.arrayStart(); n != 0; n = decoder.arrayNext()) { for (size_t i = 0; i < n; ++i) @@ -340,7 +362,8 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) } }; } - case avro::AVRO_UNION: { + case avro::AVRO_UNION: + { std::vector union_skip_fns; for (size_t i = 0; i < root_node->leaves(); i++) { @@ -352,14 +375,16 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) return [](avro::Decoder & decoder) { decoder.decodeNull(); }; case avro::AVRO_ENUM: 
return [](avro::Decoder & decoder) { decoder.decodeEnum(); }; - case avro::AVRO_FIXED: { + case avro::AVRO_FIXED: + { auto fixed_size = root_node->fixedSize(); return [fixed_size](avro::Decoder & decoder) { decoder.skipFixed(fixed_size); }; } - - case avro::AVRO_MAP: { + case avro::AVRO_MAP: + { auto value_skip_fn = createSkipFn(root_node->leafAt(1)); - return [value_skip_fn](avro::Decoder & decoder) { + return [value_skip_fn](avro::Decoder & decoder) + { for (size_t n = decoder.mapStart(); n != 0; n = decoder.mapNext()) { for (size_t i = 0; i < n; ++i) @@ -370,13 +395,15 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) } }; } - case avro::AVRO_RECORD: { + case avro::AVRO_RECORD: + { std::vector field_skip_fns; for (size_t i = 0; i < root_node->leaves(); i++) { field_skip_fns.push_back(createSkipFn(root_node->leafAt(i))); } - return [field_skip_fns](avro::Decoder & decoder) { + return [field_skip_fns](avro::Decoder & decoder) + { for (auto & skip_fn : field_skip_fns) skip_fn(decoder); }; @@ -579,27 +606,23 @@ AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(SchemaId void registerInputFormatProcessorAvro(FormatFactory & factory) { - factory.registerInputFormatProcessor( - "Avro", - [=](ReadBuffer & buf, - const Block & sample, - const RowInputFormatParams & params, - const FormatSettings & settings) { - (void)(params); - (void)(settings); - return std::make_shared(sample, buf, params); - }); + factory.registerInputFormatProcessor("Avro", [=]( + ReadBuffer & buf, + const Block & sample, + const RowInputFormatParams & params, + const FormatSettings &) + { + return std::make_shared(sample, buf, params); + }); - factory.registerInputFormatProcessor( - "AvroConfluent", - [=](ReadBuffer & buf, - const Block & sample, - const RowInputFormatParams & params, - const FormatSettings & settings) { - (void)(params); - (void)(settings); - return std::make_shared(sample, buf, params, settings); - }); + 
factory.registerInputFormatProcessor("AvroConfluent",[=]( + ReadBuffer & buf, + const Block & sample, + const RowInputFormatParams & params, + const FormatSettings & settings) + { + return std::make_shared(sample, buf, params, settings); + }); } } diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 786626170fc..56aee6930dc 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -85,36 +85,42 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF switch (data_type->getTypeId()) { case TypeIndex::UInt8: - return {avro::BoolSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - encoder.encodeBool(assert_cast(column).getElement(row_num)); - }}; + return {avro::BoolSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeBool(assert_cast(column).getElement(row_num)); + }}; case TypeIndex::Int32: - return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - encoder.encodeInt(assert_cast(column).getElement(row_num)); - }}; + return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeInt(assert_cast(column).getElement(row_num)); + }}; case TypeIndex::Int64: - return {avro::LongSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - encoder.encodeLong(assert_cast(column).getElement(row_num)); - }}; + return {avro::LongSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeLong(assert_cast(column).getElement(row_num)); + }}; case TypeIndex::Float32: - return {avro::FloatSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - encoder.encodeFloat(assert_cast(column).getElement(row_num)); - }}; + return {avro::FloatSchema(), [](const 
IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeFloat(assert_cast(column).getElement(row_num)); + }}; case TypeIndex::Float64: - return {avro::DoubleSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - encoder.encodeDouble(assert_cast(column).getElement(row_num)); - }}; - case TypeIndex::Date: { + return {avro::DoubleSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeDouble(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::Date: + { auto schema = avro::IntSchema(); schema.root()->setLogicalType(avro::LogicalType(avro::LogicalType::DATE)); - return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - UInt16 date = assert_cast(column).getElement(row_num); - encoder.encodeInt(date); - }}; + return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + UInt16 date = assert_cast(column).getElement(row_num); + encoder.encodeInt(date); + }}; } - case TypeIndex::DateTime: - throw Exception("Unsupported Avro type", ErrorCodes::BAD_TYPE_OF_FIELD); - case TypeIndex::DateTime64: { + case TypeIndex::DateTime64: + { auto schema = avro::LongSchema(); const auto & provided_type = assert_cast(*data_type); @@ -123,27 +129,32 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF else if (provided_type.getScale() == 6) schema.root()->setLogicalType(avro::LogicalType(avro::LogicalType::TIMESTAMP_MICROS)); else - throw Exception("Unsupported Avro type", ErrorCodes::BAD_TYPE_OF_FIELD); + break; - return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - const auto & col = assert_cast(column); - encoder.encodeLong(col.getElement(row_num)); - }}; + return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + const auto & col = assert_cast(column); + encoder.encodeLong(col.getElement(row_num)); + }}; } case TypeIndex::String: - 
return {avro::StringSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - const StringRef & s = assert_cast(column).getDataAt(row_num); - encoder.encodeBytes(reinterpret_cast(s.data), s.size); - }}; - case TypeIndex::FixedString: { - return {avro::FixedSchema(data_type->getSizeOfValueInMemory(), "fixed"), - [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - const StringRef & s = assert_cast(column).getDataAt(row_num); - encoder.encodeFixed(reinterpret_cast(s.data), s.size); - }}; + return {avro::StringSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + const StringRef & s = assert_cast(column).getDataAt(row_num); + encoder.encodeBytes(reinterpret_cast(s.data), s.size); + }}; + case TypeIndex::FixedString: + { + auto schema = avro::FixedSchema(data_type->getSizeOfValueInMemory(), "fixed"); + return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + const StringRef & s = assert_cast(column).getDataAt(row_num); + encoder.encodeFixed(reinterpret_cast(s.data), s.size); + }}; } - case TypeIndex::Enum8: { - auto schema = avro::EnumSchema("enum"); + case TypeIndex::Enum8: + { + auto schema = avro::EnumSchema("enum8"); std::unordered_map enum_mapping; const auto & enum_values = assert_cast(*data_type).getValues(); for (size_t i = 0; i < enum_values.size(); ++i) @@ -151,13 +162,15 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF schema.addSymbol(enum_values[i].first); enum_mapping.emplace(enum_values[i].second, i); } - return {schema, [enum_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - auto enum_value = assert_cast(column).getElement(row_num); - encoder.encodeEnum(enum_mapping.at(enum_value)); - }}; + return {schema, [enum_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + auto enum_value = assert_cast(column).getElement(row_num); + 
encoder.encodeEnum(enum_mapping.at(enum_value)); + }}; } - case TypeIndex::Enum16: { - auto schema = avro::EnumSchema("enum"); + case TypeIndex::Enum16: + { + auto schema = avro::EnumSchema("enum16"); std::unordered_map enum_mapping; const auto & enum_values = assert_cast(*data_type).getValues(); for (size_t i = 0; i < enum_values.size(); ++i) @@ -165,36 +178,40 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF schema.addSymbol(enum_values[i].first); enum_mapping.emplace(enum_values[i].second, i); } - return {schema, [enum_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - auto enum_value = assert_cast(column).getElement(row_num); - encoder.encodeEnum(enum_mapping.at(enum_value)); - }}; + return {schema, [enum_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + auto enum_value = assert_cast(column).getElement(row_num); + encoder.encodeEnum(enum_mapping.at(enum_value)); + }}; } - case TypeIndex::Array: { + case TypeIndex::Array: + { const auto & array_type = assert_cast(*data_type); auto nested_mapping = createSchemaWithSerializeFn(array_type.getNestedType()); - return {avro::ArraySchema(nested_mapping.schema), - [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - size_t offset = offsets[row_num - 1]; - size_t next_offset = offsets[row_num]; - size_t row_count = next_offset - offset; - const IColumn & nested_column = column_array.getData(); + auto schema = avro::ArraySchema(nested_mapping.schema); + return {schema, [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + const ColumnArray & column_array = assert_cast(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + size_t row_count 
= next_offset - offset; + const IColumn & nested_column = column_array.getData(); - encoder.arrayStart(); - if (row_count > 0) - { - encoder.setItemCount(row_count); - } - for (size_t i = offset; i < next_offset; ++i) - { - nested_mapping.serialize(nested_column, i, encoder); - } - encoder.arrayEnd(); - }}; + encoder.arrayStart(); + if (row_count > 0) + { + encoder.setItemCount(row_count); + } + for (size_t i = offset; i < next_offset; ++i) + { + nested_mapping.serialize(nested_column, i, encoder); + } + encoder.arrayEnd(); + }}; } - case TypeIndex::Nullable: { + case TypeIndex::Nullable: + { auto nested_type = removeNullable(data_type); auto nested_mapping = createSchemaWithSerializeFn(nested_type); if (nested_type->getTypeId() == TypeIndex::Nothing) @@ -206,28 +223,31 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF avro::UnionSchema union_schema; union_schema.addType(avro::NullSchema()); union_schema.addType(nested_mapping.schema); - return {union_schema, [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - const ColumnNullable & col = assert_cast(column); - if (!col.isNullAt(row_num)) - { - encoder.encodeUnionIndex(1); - nested_mapping.serialize(col.getNestedColumn(), row_num, encoder); - } - else - { - encoder.encodeUnionIndex(0); - encoder.encodeNull(); - } - }}; + return {union_schema, [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + const ColumnNullable & col = assert_cast(column); + if (!col.isNullAt(row_num)) + { + encoder.encodeUnionIndex(1); + nested_mapping.serialize(col.getNestedColumn(), row_num, encoder); + } + else + { + encoder.encodeUnionIndex(0); + encoder.encodeNull(); + } + }}; } } - case TypeIndex::LowCardinality: { + case TypeIndex::LowCardinality: + { const auto & nested_type = removeLowCardinality(data_type); auto nested_mapping = createSchemaWithSerializeFn(nested_type); - return {nested_mapping.schema, [nested_mapping](const IColumn 
& column, size_t row_num, avro::Encoder & encoder) { - const auto & col = assert_cast(column); - nested_mapping.serialize(*col.getDictionary().getNestedColumn(), col.getIndexAt(row_num), encoder); - }}; + return {nested_mapping.schema, [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + const auto & col = assert_cast(column); + nested_mapping.serialize(*col.getDictionary().getNestedColumn(), col.getIndexAt(row_num), encoder); + }}; } case TypeIndex::Nothing: return {avro::NullSchema(), [](const IColumn &, size_t, avro::Encoder & encoder) { encoder.encodeNull(); }}; @@ -298,15 +318,14 @@ void AvroRowOutputFormat::writeSuffix() void registerOutputFormatProcessorAvro(FormatFactory & factory) { - factory.registerOutputFormatProcessor( - "Avro", - [=](WriteBuffer & buf, - const Block & sample, - FormatFactory::WriteCallback callback, - const FormatSettings & settings) { - (void)(callback); - return std::make_shared(buf, sample, callback, settings); - }); + factory.registerOutputFormatProcessor("Avro",[=]( + WriteBuffer & buf, + const Block & sample, + FormatFactory::WriteCallback callback, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, callback, settings); + }); } } From d15e820e9fec9c47dc55dc80e6a0b5b6ceed89c0 Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Fri, 10 Jan 2020 03:05:20 -0600 Subject: [PATCH 05/89] Add Avro tests --- .../queries/0_stateless/01060_avro.reference | 37 + dbms/tests/queries/0_stateless/01060_avro.sh | 68 ++ .../0_stateless/data_avro/complex.avro | Bin 0 -> 843 bytes .../0_stateless/data_avro/complex.avsc | 20 + .../0_stateless/data_avro/complex.json | 2 + .../queries/0_stateless/data_avro/empty.avro | Bin 0 -> 120 bytes .../queries/0_stateless/data_avro/empty.avsc | 7 + .../queries/0_stateless/data_avro/empty.json | 0 .../0_stateless/data_avro/generate_avro.sh | 14 + .../0_stateless/data_avro/logical_types.avro | Bin 0 -> 361 bytes 
.../0_stateless/data_avro/logical_types.avsc | 9 + .../0_stateless/data_avro/logical_types.json | 1 + .../0_stateless/data_avro/primitive.avro | Bin 0 -> 427 bytes .../0_stateless/data_avro/primitive.avsc | 14 + .../0_stateless/data_avro/primitive.json | 2 + .../queries/0_stateless/data_avro/simple.avsc | 7 + .../0_stateless/data_avro/simple.deflate.avro | Bin 0 -> 1698 bytes .../queries/0_stateless/data_avro/simple.json | 1000 +++++++++++++++++ .../0_stateless/data_avro/simple.null.avro | Bin 0 -> 2077 bytes .../0_stateless/data_avro/simple.snappy.avro | Bin 0 -> 2088 bytes 20 files changed, 1181 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/01060_avro.reference create mode 100755 dbms/tests/queries/0_stateless/01060_avro.sh create mode 100644 dbms/tests/queries/0_stateless/data_avro/complex.avro create mode 100644 dbms/tests/queries/0_stateless/data_avro/complex.avsc create mode 100644 dbms/tests/queries/0_stateless/data_avro/complex.json create mode 100644 dbms/tests/queries/0_stateless/data_avro/empty.avro create mode 100644 dbms/tests/queries/0_stateless/data_avro/empty.avsc create mode 100644 dbms/tests/queries/0_stateless/data_avro/empty.json create mode 100755 dbms/tests/queries/0_stateless/data_avro/generate_avro.sh create mode 100644 dbms/tests/queries/0_stateless/data_avro/logical_types.avro create mode 100644 dbms/tests/queries/0_stateless/data_avro/logical_types.avsc create mode 100644 dbms/tests/queries/0_stateless/data_avro/logical_types.json create mode 100644 dbms/tests/queries/0_stateless/data_avro/primitive.avro create mode 100644 dbms/tests/queries/0_stateless/data_avro/primitive.avsc create mode 100644 dbms/tests/queries/0_stateless/data_avro/primitive.json create mode 100644 dbms/tests/queries/0_stateless/data_avro/simple.avsc create mode 100644 dbms/tests/queries/0_stateless/data_avro/simple.deflate.avro create mode 100644 dbms/tests/queries/0_stateless/data_avro/simple.json create mode 100644 
dbms/tests/queries/0_stateless/data_avro/simple.null.avro create mode 100644 dbms/tests/queries/0_stateless/data_avro/simple.snappy.avro diff --git a/dbms/tests/queries/0_stateless/01060_avro.reference b/dbms/tests/queries/0_stateless/01060_avro.reference new file mode 100644 index 00000000000..d8ee426a337 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01060_avro.reference @@ -0,0 +1,37 @@ +=== input += primitive +1,1,2,3.4,5.6,"b1","s1" +0,-1,9223372036854775807,3.00004,0.00001,"","" +1,2,"s1" +0,9223372036854775807,"" +"s1",2,1 +"",9223372036854775807,0 +"s1" +"" += complex +"A","t","['s1','s2']","[['a1'],['a2']]","s1",\N,"79cd909892d7e7ade1987cc7422628ba" +"C","f","[]","[]",\N,123,"79cd909892d7e7ade1987cc7422628ba" +"79cd909892d7e7ade1987cc7422628ba" +"79cd909892d7e7ade1987cc7422628ba" += logical_types +"2019-12-20","2020-01-10 01:31:56.227","2020-01-10 01:31:56.227000" +18250,1578641516227,1578641516227000 += compression +1000 +1000 +1000 += other +0 +not compatible +not found +=== output += primitive +1,1,2,3.4,5.6,"b1","s1" += complex +"A","t","['s1','s2']","[['a1'],['a2']]","s1",\N,"79cd909892d7e7ade1987cc7422628ba" += logical_types +"2019-12-20","2020-01-10 01:31:56.227","2020-01-10 01:31:56.227000" += other +0 +1000 +not supported diff --git a/dbms/tests/queries/0_stateless/01060_avro.sh b/dbms/tests/queries/0_stateless/01060_avro.sh new file mode 100755 index 00000000000..fbde59e58fa --- /dev/null +++ b/dbms/tests/queries/0_stateless/01060_avro.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +set -e + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. 
$CUR_DIR/../shell_config.sh + +DATA_DIR=$CUR_DIR/data_avro + +# input +echo === input +echo = primitive + +cat $DATA_DIR/primitive.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_bool UInt8, b_int Int32, c_long Int64, d_float Float32, e_double Float64, f_bytes String, g_string String' -q 'select * from table' +cat $DATA_DIR/primitive.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_bool UInt8, c_long Int64, g_string String' -q 'select * from table' +cat $DATA_DIR/primitive.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'g_string String, c_long Int64, a_bool UInt8' -q 'select * from table' +cat $DATA_DIR/primitive.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'g_string String' -q 'select * from table' + +echo = complex +cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_enum_to_string String, b_enum_to_enum Enum('t' = 1, 'f' = 0), c_array_string Array(String), d_array_array_string Array(Array(String)), e_union_null_string Nullable(String), f_union_long_null Nullable(Int64), g_fixed FixedString(32)" -q 'select * from table' +cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "g_fixed FixedString(32)" -q 'select * from table' + +echo = logical_types +cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Date, b_timestamp_millis DateTime64, c_timestamp_micros DateTime64(6)' -q 'select * from table' +cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64' -q 'select * from table' + + + +echo = compression +cat $DATA_DIR/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table' +cat $DATA_DIR/simple.deflate.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format 
CSV -S 'a Int64' -q 'select count() from table' +cat $DATA_DIR/simple.snappy.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table' + +echo = other +#no data +cat $DATA_DIR/empty.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table' +# type mismatch +cat $DATA_DIR/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int32' -q 'select count() from table' 2>&1 | grep -i 'not compatible' -o +# field not found +cat $DATA_DIR/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'b Int64' -q 'select count() from table' 2>&1 | grep -i 'not found' -o + + + + + + +# output +echo === output + +echo = primitive +S1="a_bool UInt8, b_int Int32, c_long Int64, d_float Float32, e_double Float64, f_bytes String, g_string String" +echo '1,1,2,3.4,5.6,"b1","s1"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S1" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S1" -q 'select * from table' + +echo = complex +S2="a_enum_to_string String, b_enum_to_enum Enum('t' = 1, 'f' = 0), c_array_string Array(String), d_array_array_string Array(Array(String)), e_union_null_string Nullable(String), f_union_long_null Nullable(Int64), g_fixed FixedString(32)" +echo "\"A\",\"t\",\"['s1','s2']\",\"[['a1'],['a2']]\",\"s1\",\N,\"79cd909892d7e7ade1987cc7422628ba\"" | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S2" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S2" -q 'select * from table' + +echo = logical_types +S3="a_date Date, b_timestamp_millis DateTime64, c_timestamp_micros DateTime64(6)" +echo '"2019-12-20","2020-01-10 01:31:56.227","2020-01-10 01:31:56.227000"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 
'select * from table' + +echo = other +S4="a Int64" +${CLICKHOUSE_LOCAL} -q "select toInt64(number) as a from numbers(0) format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S4" -q 'select count() from table' +${CLICKHOUSE_LOCAL} -q "select toInt64(number) as a from numbers(1000) format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S4" -q 'select count() from table' + +# type not supported +${CLICKHOUSE_LOCAL} -q "select toInt16(123) as a format Avro" 2>&1 | grep -i 'not supported' -o \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/data_avro/complex.avro b/dbms/tests/queries/0_stateless/data_avro/complex.avro new file mode 100644 index 0000000000000000000000000000000000000000..0880f5818829114622dc789466eead14fca4dba7 GIT binary patch literal 843 zcmb7?u};G<5Qfvp127{bR?ih829c&4@BlC{QB{`XIB6h>t2hA)$}=$VHt-ItEG*2d zh!Jsd(vSwKib%2apM9V2&i{#f(L-D8DwR8mPejZfH|}sYn+iO{so*m8a33cu)>>YJ z@Iw)K3J*tj^Ejg-$zz&HsUHt_`3i)h`#`f+O}w zdnOZF8HTFMaQd6ob(6~7CiI7=8o<2C0*o;(Hnml`^aNik`!McLzqZewUS6}e^N)x1 z_W(Wj3*iB4!z|5Cc|anYP0{XV)1x#IuFXJYhKln{#`gkOMMsnaPl8!1vi( IeX@J=H!X4%b^rhX literal 0 HcmV?d00001 diff --git a/dbms/tests/queries/0_stateless/data_avro/complex.avsc b/dbms/tests/queries/0_stateless/data_avro/complex.avsc new file mode 100644 index 00000000000..325169aeb57 --- /dev/null +++ b/dbms/tests/queries/0_stateless/data_avro/complex.avsc @@ -0,0 +1,20 @@ +{ + "type": "record", + "name": "row", + "fields": [ + {"name": "a_enum_to_string", "type": { "type": "enum", "name": "enum_1", "symbols" : ["A", "B", "C"]}}, + {"name": "b_enum_to_enum", "type": { "type": "enum", "name": "enum_2", "symbols" : ["t", "f"]}}, + {"name": "c_array_string", "type": { "type": "array", "items": "string"}}, + {"name": "d_array_array_string", "type": { "type": "array", "items": {"type": "array", "items": "string"}}}, + {"name": "e_union_null_string", "type": ["null", "string"]}, + {"name": "f_union_long_null", "type": ["long", 
"null"]}, + {"name": "g_fixed", "type": {"type":"fixed", "size": 32, "name": "fixed_1"}}, + {"name": "h_record_skip", "type": { + "type": "record", + "name": "subrecord", + "fields": [ + {"name": "a", "type": "string"} + ] + }} + ] + } \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/data_avro/complex.json b/dbms/tests/queries/0_stateless/data_avro/complex.json new file mode 100644 index 00000000000..d05e09c72fc --- /dev/null +++ b/dbms/tests/queries/0_stateless/data_avro/complex.json @@ -0,0 +1,2 @@ +{"a_enum_to_string":"A","b_enum_to_enum":"t","c_array_string":["s1", "s2"],"d_array_array_string":[["a1"], ["a2"]],"e_union_null_string":{"string": "s1"},"f_union_long_null":null,"g_fixed":"79cd909892d7e7ade1987cc7422628ba","h_record_skip":{"a": "a"}} +{"a_enum_to_string":"C","b_enum_to_enum":"f","c_array_string":[],"d_array_array_string":[],"e_union_null_string":null,"f_union_long_null":{"long": 123},"g_fixed":"79cd909892d7e7ade1987cc7422628ba","h_record_skip":{"a": "a"}} \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/data_avro/empty.avro b/dbms/tests/queries/0_stateless/data_avro/empty.avro new file mode 100644 index 0000000000000000000000000000000000000000..7cfae81758cf61917da409cd2a9ad03d3af2130c GIT binary patch literal 120 zcmeZI%3@>@ODrqO*DFrWNX<>`V60XusVqoUvQjEaP0lY$QPNS$OUwoH^UHypw9M3; zlwu{T=xUgFB2W~nB_}^GU8y# primitive.avro +avro-tools fromjson --schema-file complex.avsc complex.json > complex.avro +avro-tools fromjson --schema-file logical_types.avsc logical_types.json > logical_types.avro +avro-tools fromjson --schema-file empty.avsc empty.json > empty.avro + +#compression +avro-tools fromjson --codec null --schema-file simple.avsc simple.json > simple.null.avro +avro-tools fromjson --codec deflate --schema-file simple.avsc simple.json > simple.deflate.avro +avro-tools fromjson --codec snappy --schema-file simple.avsc simple.json > simple.snappy.avro \ No newline at end of file diff --git 
a/dbms/tests/queries/0_stateless/data_avro/logical_types.avro b/dbms/tests/queries/0_stateless/data_avro/logical_types.avro new file mode 100644 index 0000000000000000000000000000000000000000..7b8a3f60b7a078a39b5bf6b2f7d841ccde80c056 GIT binary patch literal 361 zcmeZI%3@>@ODrqO*DFrWNX<=L!cwhNQdy9yWTjM;nw(#hqNJmgmzWFY=a&OHX_=`x zDaA@w(bX{V#Q2oNl2o7+L?2v#W?l(UFeg7fGdVFQ1Y#ywWo@kvTys)@ODrqO*DFrWNX<=r!&0qOQdy9yWTjM;nw(#hqNJmgmzWFY=a&OHX_=`x zDaA@w(bX{V#Q3EA{2ZVZR3C_!nwY0ls{>b(6rY(_f=~hCAju}j=j7+5Bc#DRB>9y1 zw4D4zB>fOBl8V&$l>E}9oK%DvP(G5HwD_dTlGI{^5(pPbMS6U3Nl_-UOQ3utH5u`F zr8!8R1@mfSYem3en4F)In#=(bV_3N`eD0eSbLGoYi6WF8W@N< hup}9>6dN)y{`(IFj4K!%96n||?%4X{i5&xm0RZw(n9l$J literal 0 HcmV?d00001 diff --git a/dbms/tests/queries/0_stateless/data_avro/primitive.avsc b/dbms/tests/queries/0_stateless/data_avro/primitive.avsc new file mode 100644 index 00000000000..a4f06d02b01 --- /dev/null +++ b/dbms/tests/queries/0_stateless/data_avro/primitive.avsc @@ -0,0 +1,14 @@ +{ + "type": "record", + "name": "row", + "fields": [ + {"name": "a_bool", "type": "boolean"}, + {"name": "b_int", "type": "int"}, + {"name": "c_long", "type": "long"}, + {"name": "d_float", "type": "float"}, + {"name": "e_double", "type": "double"}, + {"name": "f_bytes", "type": "bytes"}, + {"name": "g_string", "type": "string"}, + {"name": "h_null", "type": "null"} + ] + } \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/data_avro/primitive.json b/dbms/tests/queries/0_stateless/data_avro/primitive.json new file mode 100644 index 00000000000..fc521c8829c --- /dev/null +++ b/dbms/tests/queries/0_stateless/data_avro/primitive.json @@ -0,0 +1,2 @@ +{"a_bool":true,"b_int":1,"c_long":2,"d_float":3.4,"e_double":5.6,"f_bytes":"b1","g_string":"s1","h_null": null} +{"a_bool":false,"b_int":-1,"c_long":9223372036854775807,"d_float":3.00004,"e_double":0.00001,"f_bytes":"","g_string":"","h_null": null} \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/data_avro/simple.avsc 
b/dbms/tests/queries/0_stateless/data_avro/simple.avsc new file mode 100644 index 00000000000..923eda71054 --- /dev/null +++ b/dbms/tests/queries/0_stateless/data_avro/simple.avsc @@ -0,0 +1,7 @@ +{ + "type": "record", + "name": "row", + "fields": [ + {"name": "a", "type": "long"} + ] + } \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/data_avro/simple.deflate.avro b/dbms/tests/queries/0_stateless/data_avro/simple.deflate.avro new file mode 100644 index 0000000000000000000000000000000000000000..d4ba226b44753945fc570014d43472800f3a9af4 GIT binary patch literal 1698 zcmZuyTS#LE0`0`EkHqAH7F`rnt|2Bj@kSC!>^1I(g65$KC@3ha@h0YRb8~OF3DM@! z`Hjy&3%e})p@|4^l*O8fph*2 zM}uMy-mIH;QY38GnP#lhz6Wm5@s4cDTW6A~;DHLGRzaZ~fM&BlR;nmVU;7PdjGb(Ej5{d?Ydn^*i1c{~wImw~gxe*or@6>JfmZKS->;D%C$Kejt4KV|eP-t@P2) zQS#x_55IeL2mG!0z5m_P`>S__r;m$2#^3$r-{0N6{OMuwht<2%KOf({xVSGqeSG)J zDscbmi~p>8ZXf4lA2U231GJXP-j-^xg^sqcYFes>TbQC22Gs)4*D953)zBIpTVpk> zRgJGPku`>E4Jg-E_G_y-TDn2Ys?%1DYMBx(!>I*W$Cc{iYWSE=9J88^t0s?`vLlA~ z2q^2Q?CYp*?V#&AShXEhBOOd}2gBY0Ft#dHTh;t6x@C*ixK%Z=#q8cFUy`RdY|~qx_o_IORZfCx~`SluKAI!WpUS{z01$o^{IB3 z_`3_1-Id1O`H9`-?%hTAuD@c=H!!!N?f6or;mpA#tb5-l<{i$W=Ro{2irbN8PxinAjQW-ch-CG!+i{fMc-Dp)@+w z^$x|DV@T>y%{nx|wOn&OD7;pZ*XrhL#nkmsPv*dr$LIy6R zi9)uikeVzU%Dz4Dew!w9d20Xjjy*O>wMR`upH&~qXG^J8Bn+m z<&C245|r;mg{%{<`a}SqK*R~$bi$iFX_tM;_kJlX6LI@Qf>sfv6T!72-iWAOEaKZm zLdG^%wJqRpLzZp0aho@>-QK;;cW(&${BkZBTO?5#EN#+gS7M=G1Mp=W~nabCSj}^>T2KgP|OvhGQP)SVbHQ#Ubed zlQMuq0SpTe4FU6bz={Mct^iqXF!dX7jsY_mh&qFL)L@kuEKUQ-Iy0%yaQF-(&WNTn z^W>RT_O-?PHCZMx^-1tn38s?}wG#7)#446p>=KfZFsTwaKY>{iL}S7{k+60rEbatZ z;WQ06@ir%BbQ1MW^O(~rby{YfByeNW+~C3+jJzS5Z_HCS)}HKhPxgWa=X>D-2%n?y y%NqFEF#JLUKd0aedMvMu6`fyAD>Vi)B^zMm*?Uf%!lCVmBbUD7}R literal 0 HcmV?d00001 diff --git a/dbms/tests/queries/0_stateless/data_avro/simple.json b/dbms/tests/queries/0_stateless/data_avro/simple.json new file mode 100644 index 00000000000..c09fc0b732f --- /dev/null +++ b/dbms/tests/queries/0_stateless/data_avro/simple.json @@ -0,0 +1,1000 
@@ +{"a":1} +{"a":2} +{"a":3} +{"a":4} +{"a":5} +{"a":6} +{"a":7} +{"a":8} +{"a":9} +{"a":10} +{"a":11} +{"a":12} +{"a":13} +{"a":14} +{"a":15} +{"a":16} +{"a":17} +{"a":18} +{"a":19} +{"a":20} +{"a":21} +{"a":22} +{"a":23} +{"a":24} +{"a":25} +{"a":26} +{"a":27} +{"a":28} +{"a":29} +{"a":30} +{"a":31} +{"a":32} +{"a":33} +{"a":34} +{"a":35} +{"a":36} +{"a":37} +{"a":38} +{"a":39} +{"a":40} +{"a":41} +{"a":42} +{"a":43} +{"a":44} +{"a":45} +{"a":46} +{"a":47} +{"a":48} +{"a":49} +{"a":50} +{"a":51} +{"a":52} +{"a":53} +{"a":54} +{"a":55} +{"a":56} +{"a":57} +{"a":58} +{"a":59} +{"a":60} +{"a":61} +{"a":62} +{"a":63} +{"a":64} +{"a":65} +{"a":66} +{"a":67} +{"a":68} +{"a":69} +{"a":70} +{"a":71} +{"a":72} +{"a":73} +{"a":74} +{"a":75} +{"a":76} +{"a":77} +{"a":78} +{"a":79} +{"a":80} +{"a":81} +{"a":82} +{"a":83} +{"a":84} +{"a":85} +{"a":86} +{"a":87} +{"a":88} +{"a":89} +{"a":90} +{"a":91} +{"a":92} +{"a":93} +{"a":94} +{"a":95} +{"a":96} +{"a":97} +{"a":98} +{"a":99} +{"a":100} +{"a":101} +{"a":102} +{"a":103} +{"a":104} +{"a":105} +{"a":106} +{"a":107} +{"a":108} +{"a":109} +{"a":110} +{"a":111} +{"a":112} +{"a":113} +{"a":114} +{"a":115} +{"a":116} +{"a":117} +{"a":118} +{"a":119} +{"a":120} +{"a":121} +{"a":122} +{"a":123} +{"a":124} +{"a":125} +{"a":126} +{"a":127} +{"a":128} +{"a":129} +{"a":130} +{"a":131} +{"a":132} +{"a":133} +{"a":134} +{"a":135} +{"a":136} +{"a":137} +{"a":138} +{"a":139} +{"a":140} +{"a":141} +{"a":142} +{"a":143} +{"a":144} +{"a":145} +{"a":146} +{"a":147} +{"a":148} +{"a":149} +{"a":150} +{"a":151} +{"a":152} +{"a":153} +{"a":154} +{"a":155} +{"a":156} +{"a":157} +{"a":158} +{"a":159} +{"a":160} +{"a":161} +{"a":162} +{"a":163} +{"a":164} +{"a":165} +{"a":166} +{"a":167} +{"a":168} +{"a":169} +{"a":170} +{"a":171} +{"a":172} +{"a":173} +{"a":174} +{"a":175} +{"a":176} +{"a":177} +{"a":178} +{"a":179} +{"a":180} +{"a":181} +{"a":182} +{"a":183} +{"a":184} +{"a":185} +{"a":186} +{"a":187} +{"a":188} +{"a":189} +{"a":190} +{"a":191} 
+{"a":192} +{"a":193} +{"a":194} +{"a":195} +{"a":196} +{"a":197} +{"a":198} +{"a":199} +{"a":200} +{"a":201} +{"a":202} +{"a":203} +{"a":204} +{"a":205} +{"a":206} +{"a":207} +{"a":208} +{"a":209} +{"a":210} +{"a":211} +{"a":212} +{"a":213} +{"a":214} +{"a":215} +{"a":216} +{"a":217} +{"a":218} +{"a":219} +{"a":220} +{"a":221} +{"a":222} +{"a":223} +{"a":224} +{"a":225} +{"a":226} +{"a":227} +{"a":228} +{"a":229} +{"a":230} +{"a":231} +{"a":232} +{"a":233} +{"a":234} +{"a":235} +{"a":236} +{"a":237} +{"a":238} +{"a":239} +{"a":240} +{"a":241} +{"a":242} +{"a":243} +{"a":244} +{"a":245} +{"a":246} +{"a":247} +{"a":248} +{"a":249} +{"a":250} +{"a":251} +{"a":252} +{"a":253} +{"a":254} +{"a":255} +{"a":256} +{"a":257} +{"a":258} +{"a":259} +{"a":260} +{"a":261} +{"a":262} +{"a":263} +{"a":264} +{"a":265} +{"a":266} +{"a":267} +{"a":268} +{"a":269} +{"a":270} +{"a":271} +{"a":272} +{"a":273} +{"a":274} +{"a":275} +{"a":276} +{"a":277} +{"a":278} +{"a":279} +{"a":280} +{"a":281} +{"a":282} +{"a":283} +{"a":284} +{"a":285} +{"a":286} +{"a":287} +{"a":288} +{"a":289} +{"a":290} +{"a":291} +{"a":292} +{"a":293} +{"a":294} +{"a":295} +{"a":296} +{"a":297} +{"a":298} +{"a":299} +{"a":300} +{"a":301} +{"a":302} +{"a":303} +{"a":304} +{"a":305} +{"a":306} +{"a":307} +{"a":308} +{"a":309} +{"a":310} +{"a":311} +{"a":312} +{"a":313} +{"a":314} +{"a":315} +{"a":316} +{"a":317} +{"a":318} +{"a":319} +{"a":320} +{"a":321} +{"a":322} +{"a":323} +{"a":324} +{"a":325} +{"a":326} +{"a":327} +{"a":328} +{"a":329} +{"a":330} +{"a":331} +{"a":332} +{"a":333} +{"a":334} +{"a":335} +{"a":336} +{"a":337} +{"a":338} +{"a":339} +{"a":340} +{"a":341} +{"a":342} +{"a":343} +{"a":344} +{"a":345} +{"a":346} +{"a":347} +{"a":348} +{"a":349} +{"a":350} +{"a":351} +{"a":352} +{"a":353} +{"a":354} +{"a":355} +{"a":356} +{"a":357} +{"a":358} +{"a":359} +{"a":360} +{"a":361} +{"a":362} +{"a":363} +{"a":364} +{"a":365} +{"a":366} +{"a":367} +{"a":368} +{"a":369} +{"a":370} +{"a":371} +{"a":372} 
+{"a":373} +{"a":374} +{"a":375} +{"a":376} +{"a":377} +{"a":378} +{"a":379} +{"a":380} +{"a":381} +{"a":382} +{"a":383} +{"a":384} +{"a":385} +{"a":386} +{"a":387} +{"a":388} +{"a":389} +{"a":390} +{"a":391} +{"a":392} +{"a":393} +{"a":394} +{"a":395} +{"a":396} +{"a":397} +{"a":398} +{"a":399} +{"a":400} +{"a":401} +{"a":402} +{"a":403} +{"a":404} +{"a":405} +{"a":406} +{"a":407} +{"a":408} +{"a":409} +{"a":410} +{"a":411} +{"a":412} +{"a":413} +{"a":414} +{"a":415} +{"a":416} +{"a":417} +{"a":418} +{"a":419} +{"a":420} +{"a":421} +{"a":422} +{"a":423} +{"a":424} +{"a":425} +{"a":426} +{"a":427} +{"a":428} +{"a":429} +{"a":430} +{"a":431} +{"a":432} +{"a":433} +{"a":434} +{"a":435} +{"a":436} +{"a":437} +{"a":438} +{"a":439} +{"a":440} +{"a":441} +{"a":442} +{"a":443} +{"a":444} +{"a":445} +{"a":446} +{"a":447} +{"a":448} +{"a":449} +{"a":450} +{"a":451} +{"a":452} +{"a":453} +{"a":454} +{"a":455} +{"a":456} +{"a":457} +{"a":458} +{"a":459} +{"a":460} +{"a":461} +{"a":462} +{"a":463} +{"a":464} +{"a":465} +{"a":466} +{"a":467} +{"a":468} +{"a":469} +{"a":470} +{"a":471} +{"a":472} +{"a":473} +{"a":474} +{"a":475} +{"a":476} +{"a":477} +{"a":478} +{"a":479} +{"a":480} +{"a":481} +{"a":482} +{"a":483} +{"a":484} +{"a":485} +{"a":486} +{"a":487} +{"a":488} +{"a":489} +{"a":490} +{"a":491} +{"a":492} +{"a":493} +{"a":494} +{"a":495} +{"a":496} +{"a":497} +{"a":498} +{"a":499} +{"a":500} +{"a":501} +{"a":502} +{"a":503} +{"a":504} +{"a":505} +{"a":506} +{"a":507} +{"a":508} +{"a":509} +{"a":510} +{"a":511} +{"a":512} +{"a":513} +{"a":514} +{"a":515} +{"a":516} +{"a":517} +{"a":518} +{"a":519} +{"a":520} +{"a":521} +{"a":522} +{"a":523} +{"a":524} +{"a":525} +{"a":526} +{"a":527} +{"a":528} +{"a":529} +{"a":530} +{"a":531} +{"a":532} +{"a":533} +{"a":534} +{"a":535} +{"a":536} +{"a":537} +{"a":538} +{"a":539} +{"a":540} +{"a":541} +{"a":542} +{"a":543} +{"a":544} +{"a":545} +{"a":546} +{"a":547} +{"a":548} +{"a":549} +{"a":550} +{"a":551} +{"a":552} +{"a":553} 
+{"a":554} +{"a":555} +{"a":556} +{"a":557} +{"a":558} +{"a":559} +{"a":560} +{"a":561} +{"a":562} +{"a":563} +{"a":564} +{"a":565} +{"a":566} +{"a":567} +{"a":568} +{"a":569} +{"a":570} +{"a":571} +{"a":572} +{"a":573} +{"a":574} +{"a":575} +{"a":576} +{"a":577} +{"a":578} +{"a":579} +{"a":580} +{"a":581} +{"a":582} +{"a":583} +{"a":584} +{"a":585} +{"a":586} +{"a":587} +{"a":588} +{"a":589} +{"a":590} +{"a":591} +{"a":592} +{"a":593} +{"a":594} +{"a":595} +{"a":596} +{"a":597} +{"a":598} +{"a":599} +{"a":600} +{"a":601} +{"a":602} +{"a":603} +{"a":604} +{"a":605} +{"a":606} +{"a":607} +{"a":608} +{"a":609} +{"a":610} +{"a":611} +{"a":612} +{"a":613} +{"a":614} +{"a":615} +{"a":616} +{"a":617} +{"a":618} +{"a":619} +{"a":620} +{"a":621} +{"a":622} +{"a":623} +{"a":624} +{"a":625} +{"a":626} +{"a":627} +{"a":628} +{"a":629} +{"a":630} +{"a":631} +{"a":632} +{"a":633} +{"a":634} +{"a":635} +{"a":636} +{"a":637} +{"a":638} +{"a":639} +{"a":640} +{"a":641} +{"a":642} +{"a":643} +{"a":644} +{"a":645} +{"a":646} +{"a":647} +{"a":648} +{"a":649} +{"a":650} +{"a":651} +{"a":652} +{"a":653} +{"a":654} +{"a":655} +{"a":656} +{"a":657} +{"a":658} +{"a":659} +{"a":660} +{"a":661} +{"a":662} +{"a":663} +{"a":664} +{"a":665} +{"a":666} +{"a":667} +{"a":668} +{"a":669} +{"a":670} +{"a":671} +{"a":672} +{"a":673} +{"a":674} +{"a":675} +{"a":676} +{"a":677} +{"a":678} +{"a":679} +{"a":680} +{"a":681} +{"a":682} +{"a":683} +{"a":684} +{"a":685} +{"a":686} +{"a":687} +{"a":688} +{"a":689} +{"a":690} +{"a":691} +{"a":692} +{"a":693} +{"a":694} +{"a":695} +{"a":696} +{"a":697} +{"a":698} +{"a":699} +{"a":700} +{"a":701} +{"a":702} +{"a":703} +{"a":704} +{"a":705} +{"a":706} +{"a":707} +{"a":708} +{"a":709} +{"a":710} +{"a":711} +{"a":712} +{"a":713} +{"a":714} +{"a":715} +{"a":716} +{"a":717} +{"a":718} +{"a":719} +{"a":720} +{"a":721} +{"a":722} +{"a":723} +{"a":724} +{"a":725} +{"a":726} +{"a":727} +{"a":728} +{"a":729} +{"a":730} +{"a":731} +{"a":732} +{"a":733} +{"a":734} 
+{"a":735} +{"a":736} +{"a":737} +{"a":738} +{"a":739} +{"a":740} +{"a":741} +{"a":742} +{"a":743} +{"a":744} +{"a":745} +{"a":746} +{"a":747} +{"a":748} +{"a":749} +{"a":750} +{"a":751} +{"a":752} +{"a":753} +{"a":754} +{"a":755} +{"a":756} +{"a":757} +{"a":758} +{"a":759} +{"a":760} +{"a":761} +{"a":762} +{"a":763} +{"a":764} +{"a":765} +{"a":766} +{"a":767} +{"a":768} +{"a":769} +{"a":770} +{"a":771} +{"a":772} +{"a":773} +{"a":774} +{"a":775} +{"a":776} +{"a":777} +{"a":778} +{"a":779} +{"a":780} +{"a":781} +{"a":782} +{"a":783} +{"a":784} +{"a":785} +{"a":786} +{"a":787} +{"a":788} +{"a":789} +{"a":790} +{"a":791} +{"a":792} +{"a":793} +{"a":794} +{"a":795} +{"a":796} +{"a":797} +{"a":798} +{"a":799} +{"a":800} +{"a":801} +{"a":802} +{"a":803} +{"a":804} +{"a":805} +{"a":806} +{"a":807} +{"a":808} +{"a":809} +{"a":810} +{"a":811} +{"a":812} +{"a":813} +{"a":814} +{"a":815} +{"a":816} +{"a":817} +{"a":818} +{"a":819} +{"a":820} +{"a":821} +{"a":822} +{"a":823} +{"a":824} +{"a":825} +{"a":826} +{"a":827} +{"a":828} +{"a":829} +{"a":830} +{"a":831} +{"a":832} +{"a":833} +{"a":834} +{"a":835} +{"a":836} +{"a":837} +{"a":838} +{"a":839} +{"a":840} +{"a":841} +{"a":842} +{"a":843} +{"a":844} +{"a":845} +{"a":846} +{"a":847} +{"a":848} +{"a":849} +{"a":850} +{"a":851} +{"a":852} +{"a":853} +{"a":854} +{"a":855} +{"a":856} +{"a":857} +{"a":858} +{"a":859} +{"a":860} +{"a":861} +{"a":862} +{"a":863} +{"a":864} +{"a":865} +{"a":866} +{"a":867} +{"a":868} +{"a":869} +{"a":870} +{"a":871} +{"a":872} +{"a":873} +{"a":874} +{"a":875} +{"a":876} +{"a":877} +{"a":878} +{"a":879} +{"a":880} +{"a":881} +{"a":882} +{"a":883} +{"a":884} +{"a":885} +{"a":886} +{"a":887} +{"a":888} +{"a":889} +{"a":890} +{"a":891} +{"a":892} +{"a":893} +{"a":894} +{"a":895} +{"a":896} +{"a":897} +{"a":898} +{"a":899} +{"a":900} +{"a":901} +{"a":902} +{"a":903} +{"a":904} +{"a":905} +{"a":906} +{"a":907} +{"a":908} +{"a":909} +{"a":910} +{"a":911} +{"a":912} +{"a":913} +{"a":914} +{"a":915} 
+{"a":916} +{"a":917} +{"a":918} +{"a":919} +{"a":920} +{"a":921} +{"a":922} +{"a":923} +{"a":924} +{"a":925} +{"a":926} +{"a":927} +{"a":928} +{"a":929} +{"a":930} +{"a":931} +{"a":932} +{"a":933} +{"a":934} +{"a":935} +{"a":936} +{"a":937} +{"a":938} +{"a":939} +{"a":940} +{"a":941} +{"a":942} +{"a":943} +{"a":944} +{"a":945} +{"a":946} +{"a":947} +{"a":948} +{"a":949} +{"a":950} +{"a":951} +{"a":952} +{"a":953} +{"a":954} +{"a":955} +{"a":956} +{"a":957} +{"a":958} +{"a":959} +{"a":960} +{"a":961} +{"a":962} +{"a":963} +{"a":964} +{"a":965} +{"a":966} +{"a":967} +{"a":968} +{"a":969} +{"a":970} +{"a":971} +{"a":972} +{"a":973} +{"a":974} +{"a":975} +{"a":976} +{"a":977} +{"a":978} +{"a":979} +{"a":980} +{"a":981} +{"a":982} +{"a":983} +{"a":984} +{"a":985} +{"a":986} +{"a":987} +{"a":988} +{"a":989} +{"a":990} +{"a":991} +{"a":992} +{"a":993} +{"a":994} +{"a":995} +{"a":996} +{"a":997} +{"a":998} +{"a":999} +{"a":1000} diff --git a/dbms/tests/queries/0_stateless/data_avro/simple.null.avro b/dbms/tests/queries/0_stateless/data_avro/simple.null.avro new file mode 100644 index 0000000000000000000000000000000000000000..789ab45101fd0076e5fd966c654d2abf61826f36 GIT binary patch literal 2077 zcmZ9Nf8_f40mt|H{rmNvIcH|he9Sg;&dki5nKNg$nQdlf_RGHa{eEw^^S$r)d~dtk zcDqSSk|arzBuSDaNs=TgB7a58nBNpMCH#y!pq+PaeH@_3E!b z{GWgDvHkDl$+M?_xcca~AHDEN@!_*ak00`eKmO^@ ze*TMJ{^~cs`~4e_-hAurCr{sf{-^i<^0&YL!v);H1H8Zo{2%~=AOyl70-_)W;vfN% zAO+GO1F|3o@}K~UpajaG0;-?}>YxFdpat5X1G=CG`d|QtUqi35~g4pW?&ZP zU>+7=5td*XR$vv@U>!DK6SiO*c3>CwU>^?P5RTv&Uc(!B3n%anPT>sB;Q}t<3a;S> zZs88@;XORSBRs(~yg(OnBM$k7>c6=N}?1>qYTQT9Ll2tDxwl9 zqYA2`8mglPYN8fuqYmn#9_ph38ln*zqib}7ZqWqYp(&c7Ia;74TA?-Cpe@>=J-SB+ zbVMg~Mi=D5ZtTHc?8AN>z(E|sVI09x9K&&(z)76KX`I1XoWprsz(ribWn95kT*GzT zz)jr3ZQQ|K+{1l5z(YL3V|3|OD zh>qzsy`i^sLhtC5&gh&j=#sAJnr`Tp?&zN0(*r%y6Ft)lbul;dFfa2lKMSxR3$ZYZ zuqcbMI7_f3OR+S|uq?~5JS(swE3q=GuqvyuI%}{dYq2)#urBMdJ{zzh8?iCFW;g7X zP1qfqvKgDR1zWNeTeA(@vK`yAdv;()c4B9CVJ`0G9`5Bn?&kp>E!Xv!GC;TEHf+8fsA|j$9CgLI?k|HJ2A|tXQ 
zC-R~oilQXSq9UrIChDRgnxZAzq9eMZC;DO_hGHbf;#%B@TQL!LVk%~0E*4@bR$?tS zVk>rHFYd)b9K}hT#YMQJTY98d`lMe5WKf1=SVm-2#$;S3WKyPNT4rQc=44(LWKou6 zSyp6K)?{5aWK*_eTXtku_GDiU}SYFE;c`GOKPEO@a&gDWbTv%E-`ax0JWDxdPJfC{RR3af~Us+fwagi5NEN~?^@s+`KJf-0(#DyxdBs+y{+ zhH9#oYO9Xws-Ehrff}lj8mnt{qi)qi-KnXXskvIHrCO=A+NiDCslB>a2X#~@bygSU z(r)e1UhUI<9ne7?(qSFZQ61B9ozO|0(rKO1S)J2)UC>2c(q&!IRbA6{-Ox?l(rw+* zUER}tJL`fb1lZODdg z#71q*#%;nTZOW!?#%68K=54_iZON8x#a3<2)@{Q!ZOgW8$98Sc_U*t9?Z}SpwY{;o zc4F`B)XwbOF6`2-?AmVZ*6!@y-rIvc+LJxoi*-3}$K!Y%pW}A|PS6QCVJG55otP7M z5>C=dIcX>3WSyLocM49?DLG}Q;#8fQQ+FCp(`h+vr{i>;p3`>*&d?b-|9{f|0mqXU A_y7O^ literal 0 HcmV?d00001 diff --git a/dbms/tests/queries/0_stateless/data_avro/simple.snappy.avro b/dbms/tests/queries/0_stateless/data_avro/simple.snappy.avro new file mode 100644 index 0000000000000000000000000000000000000000..b812ed6c7ea976a74e536d1997d543f1e9962376 GIT binary patch literal 2088 zcmZ9NeW>bp0f+m(zkh!t#)ya!hfxt@L`B4ih=_=YiWm{S(cQgwt#Nnn3`Z6On_+K@M^-5jly8h=?4dG3uZGd>;5b|9*b*`da|MaPPfm zPhNTF{+kcqzUKoUT)g}K(}x$YUOanv|H-ol7cXBtzW4Sg{N#@x!yA8i_~^kq7q9;6 zga7$^AKU*<9zA*d`-=~M^WpQK6z@NI@bJEV=kdL#Pv8I4^YefI>uaZ%g3pQ9{{D|I zCO;ZEl^6f!>_ZG<%B8l?efs$qKJ(enz4-Zym%i|&FTecCSHAl7Z+!FBZ+-ha-~HbA zfAGT}{rIOpd+q1H_~oyE`@7d4yz%B+j~+jL_U?Os`tx7@>H==y0bbw(eh>gb5CUNk z0Z|YGagYE>kOFCt0a=g(c~AgFPy%I80aZ`~bqi3 z5~g4pW?&ZPU>+7=5td*XR$vv@U>!DK6SiO*c3>CwU>^?P5RTv_9K#8`f>U@6XYdBj z;Q}t<3f{ss+`ui|!96^{BRs)7c!n=HB?6p)I=@RMjg~eJ=8}7G(;nGiN{ZXq3iioF-_Jrf8aGXqM(^o)&14mS~w)XqDDzoi=EbwrHDn zXqWbApAP7dj_4&F(+RzzQ+iEj^oGvqf-dQb-qJPQ&@J83Jw4DPJ<&UQrY`1Y9_D2} z=4SyGWFZ!25f)`J7H0{TWGR+r8J1-^mS+W4WF=N+6;@?6R%Z>?WG&Wa9oA(%)@K7Y zWFvOT#%#i_*pywf8M|R~wqQ%PVz+F~Hf+mwY|jqt$WH8zotcZfxrckXkNbIm2YHBx zd4xxKjK_I`CwYped4^|sj^}xS7kP=7d4*Sbjn{dDH+hSJVkh?EAdccB?!;NRq+5EVSNf!124ql%WLQRIRK{dnCS+2kWLjoq zR_0`07GzPDWLZ{ZRn}x(He^$_WLtJ*SN3FI4&+ddX)m1&!R|7RvBXz09YND>xR9&l?x>0krP)oH^w`#36YO8i? 
zuMX;{PU=pbl}o#|M|-tT`*lDEbx4PGL`QW@$8|y{bxNmoMrU9IFZ5Ec^sQd&jo#{=-s^)t>XW|H zXYDd><1t?2Gky~=K@&1z6ERT}GjWqJNs}^ZlQCJ7GkH@mMN=|mQ!!OjGj-E2P17=M z(=lDsGkr5KLo+g$W^5+r%1q6*nVB0iHw&{gD|2hsW@ENyXZGe`j^;Y|)l%*;Z`T)@dZWSp#%bMj8XDLN&m>{OhpQ*-K0!)ZD#r|oo{uG4e+&cGQu NBj=x|eE9$0`aema9i{*P literal 0 HcmV?d00001 From 7d9a6130b61e3eb072fb35b17cfd439a5120efc4 Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Fri, 10 Jan 2020 03:28:58 -0600 Subject: [PATCH 06/89] Default to deflate if snappy is not enabled --- dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp | 8 +++++++- dbms/tests/queries/0_stateless/01060_avro.reference | 1 - dbms/tests/queries/0_stateless/01060_avro.sh | 4 +++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 56aee6930dc..603eb05528d 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -42,6 +42,12 @@ #include #include +#define DEFAULT_SYNC_INTERVAL 16*1024 +#ifdef SNAPPY_CODEC_AVAILABLE +#define DEFAULT_CODEC avro::Codec::SNAPPY_CODEC +#else +#define DEFAULT_CODEC avro::Codec::DEFLATE_CODEC +#endif namespace DB { @@ -293,7 +299,7 @@ AvroRowOutputFormat::AvroRowOutputFormat( : IRowOutputFormat(header_, out_, callback) , settings(settings_) , serializer(header_.getColumnsWithTypeAndName()) - , file_writer(std::make_unique(out_), serializer.getSchema(), 16 * 1024, avro::Codec::SNAPPY_CODEC) + , file_writer(std::make_unique(out_), serializer.getSchema(), DEFAULT_SYNC_INTERVAL, DEFAULT_CODEC) { } diff --git a/dbms/tests/queries/0_stateless/01060_avro.reference b/dbms/tests/queries/0_stateless/01060_avro.reference index d8ee426a337..a304a2b1f75 100644 --- a/dbms/tests/queries/0_stateless/01060_avro.reference +++ b/dbms/tests/queries/0_stateless/01060_avro.reference @@ -19,7 +19,6 @@ = compression 1000 1000 -1000 = other 0 not compatible 
diff --git a/dbms/tests/queries/0_stateless/01060_avro.sh b/dbms/tests/queries/0_stateless/01060_avro.sh index fbde59e58fa..1d37db93f6a 100755 --- a/dbms/tests/queries/0_stateless/01060_avro.sh +++ b/dbms/tests/queries/0_stateless/01060_avro.sh @@ -29,7 +29,9 @@ cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --ou echo = compression cat $DATA_DIR/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table' cat $DATA_DIR/simple.deflate.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table' -cat $DATA_DIR/simple.snappy.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table' + +#snappy is optional +#cat $DATA_DIR/simple.snappy.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table' echo = other #no data From 872f759b077304a4a8fc94a23b603eba234629de Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Fri, 10 Jan 2020 16:44:33 -0600 Subject: [PATCH 07/89] Fix tests - timezone --- dbms/tests/queries/0_stateless/01060_avro.reference | 4 ++-- dbms/tests/queries/0_stateless/01060_avro.sh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dbms/tests/queries/0_stateless/01060_avro.reference b/dbms/tests/queries/0_stateless/01060_avro.reference index a304a2b1f75..f8b3434177d 100644 --- a/dbms/tests/queries/0_stateless/01060_avro.reference +++ b/dbms/tests/queries/0_stateless/01060_avro.reference @@ -14,7 +14,7 @@ "79cd909892d7e7ade1987cc7422628ba" "79cd909892d7e7ade1987cc7422628ba" = logical_types -"2019-12-20","2020-01-10 01:31:56.227","2020-01-10 01:31:56.227000" +"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000" 18250,1578641516227,1578641516227000 = compression 1000 @@ -29,7 +29,7 @@ not found = complex 
"A","t","['s1','s2']","[['a1'],['a2']]","s1",\N,"79cd909892d7e7ade1987cc7422628ba" = logical_types -"2019-12-20","2020-01-10 01:31:56.227","2020-01-10 01:31:56.227000" +"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000" = other 0 1000 diff --git a/dbms/tests/queries/0_stateless/01060_avro.sh b/dbms/tests/queries/0_stateless/01060_avro.sh index 1d37db93f6a..c92cba188d7 100755 --- a/dbms/tests/queries/0_stateless/01060_avro.sh +++ b/dbms/tests/queries/0_stateless/01060_avro.sh @@ -21,7 +21,7 @@ cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-f cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "g_fixed FixedString(32)" -q 'select * from table' echo = logical_types -cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Date, b_timestamp_millis DateTime64, c_timestamp_micros DateTime64(6)' -q 'select * from table' +cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC')" -q 'select * from table' cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64' -q 'select * from table' @@ -58,8 +58,8 @@ S2="a_enum_to_string String, b_enum_to_enum Enum('t' = 1, 'f' = 0), c_array_stri echo "\"A\",\"t\",\"['s1','s2']\",\"[['a1'],['a2']]\",\"s1\",\N,\"79cd909892d7e7ade1987cc7422628ba\"" | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S2" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S2" -q 'select * from table' echo = logical_types -S3="a_date Date, b_timestamp_millis DateTime64, c_timestamp_micros DateTime64(6)" -echo '"2019-12-20","2020-01-10 01:31:56.227","2020-01-10 01:31:56.227000"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * 
from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table' +S3="a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC')" +echo '"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table' echo = other S4="a Int64" From 4c13317fba17c02851b1610bc7915807094e681b Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Fri, 10 Jan 2020 16:46:48 -0600 Subject: [PATCH 08/89] refactor avro cmake --- cmake/find/avro.cmake | 2 +- cmake/find/poco.cmake | 14 +++++++++++++- dbms/CMakeLists.txt | 4 ++++ dbms/src/Core/config_core.h.in | 1 + .../Processors/Formats/Impl/AvroRowInputFormat.cpp | 6 +++++- .../Processors/Formats/Impl/AvroRowInputFormat.h | 3 +++ 6 files changed, 27 insertions(+), 3 deletions(-) diff --git a/cmake/find/avro.cmake b/cmake/find/avro.cmake index 39ad2e31e54..cdb3fc84d3d 100644 --- a/cmake/find/avro.cmake +++ b/cmake/find/avro.cmake @@ -2,7 +2,7 @@ option (ENABLE_AVRO "Enable Avro" ${ENABLE_LIBRARIES}) if (ENABLE_AVRO) -option (USE_INTERNAL_AVRO_LIBRARY "Set to FALSE to use system avro library instead of bundled" ON) +option (USE_INTERNAL_AVRO_LIBRARY "Set to FALSE to use system avro library instead of bundled" ${NOT_UNBUNDLED}) if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/avro/lang/c++/CMakeLists.txt") if(USE_INTERNAL_AVRO_LIBRARY) diff --git a/cmake/find/poco.cmake b/cmake/find/poco.cmake index b44d2932276..0c676d374f1 100644 --- a/cmake/find/poco.cmake +++ b/cmake/find/poco.cmake @@ -14,6 +14,7 @@ if (NOT ENABLE_LIBRARIES) set (ENABLE_POCO_REDIS ${ENABLE_LIBRARIES} CACHE BOOL "") set (ENABLE_POCO_ODBC ${ENABLE_LIBRARIES} CACHE BOOL "") set (ENABLE_POCO_SQL ${ENABLE_LIBRARIES} CACHE BOOL "") + set (ENABLE_POCO_JSON ${ENABLE_LIBRARIES} CACHE BOOL "") endif () set 
(POCO_COMPONENTS Net XML SQL Data) @@ -34,6 +35,9 @@ if (NOT DEFINED ENABLE_POCO_ODBC OR ENABLE_POCO_ODBC) list (APPEND POCO_COMPONENTS DataODBC) list (APPEND POCO_COMPONENTS SQLODBC) endif () +if (NOT DEFINED ENABLE_POCO_JSON OR ENABLE_POCO_JSON) + list (APPEND POCO_COMPONENTS JSON) +endif () if (NOT USE_INTERNAL_POCO_LIBRARY) find_package (Poco COMPONENTS ${POCO_COMPONENTS}) @@ -112,6 +116,11 @@ elseif (NOT MISSING_INTERNAL_POCO_LIBRARY) endif () endif () + if (NOT DEFINED ENABLE_POCO_JSON OR ENABLE_POCO_JSON) + set (Poco_JSON_LIBRARY PocoJSON) + set (Poco_JSON_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/poco/JSON/include/") + endif () + if (OPENSSL_FOUND AND (NOT DEFINED ENABLE_POCO_NETSSL OR ENABLE_POCO_NETSSL)) set (Poco_NetSSL_LIBRARY PocoNetSSL ${OPENSSL_LIBRARIES}) set (Poco_Crypto_LIBRARY PocoCrypto ${OPENSSL_LIBRARIES}) @@ -145,8 +154,11 @@ endif () if (Poco_SQLODBC_LIBRARY AND ODBC_FOUND) set (USE_POCO_SQLODBC 1) endif () +if (Poco_JSON_LIBRARY) + set (USE_POCO_JSON 1) +endif () -message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_Crypto_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_SQL_LIBRARY},${Poco_SQLODBC_LIBRARY},${Poco_MongoDB_LIBRARY},${Poco_Redis_LIBRARY}; MongoDB=${USE_POCO_MONGODB}, Redis=${USE_POCO_REDIS}, DataODBC=${USE_POCO_DATAODBC}, NetSSL=${USE_POCO_NETSSL}") +message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_Crypto_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_SQL_LIBRARY},${Poco_SQLODBC_LIBRARY},${Poco_MongoDB_LIBRARY},${Poco_Redis_LIBRARY},${Poco_JSON_LIBRARY}; MongoDB=${USE_POCO_MONGODB}, Redis=${USE_POCO_REDIS}, DataODBC=${USE_POCO_DATAODBC}, NetSSL=${USE_POCO_NETSSL}, JSON=${USE_POCO_JSON}") # How to make sutable poco: # use branch: diff --git a/dbms/CMakeLists.txt 
b/dbms/CMakeLists.txt index 4f9ca404cdf..07677309380 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -465,6 +465,10 @@ if (USE_POCO_NETSSL) dbms_target_link_libraries (PRIVATE ${Poco_NetSSL_LIBRARY} ${Poco_Crypto_LIBRARY}) endif() +if (USE_POCO_JSON) + dbms_target_link_libraries (PRIVATE ${Poco_JSON_LIBRARY}) +endif() + dbms_target_link_libraries (PRIVATE ${Poco_Foundation_LIBRARY}) if (USE_ICU) diff --git a/dbms/src/Core/config_core.h.in b/dbms/src/Core/config_core.h.in index fdbd69decd3..2365340cf33 100644 --- a/dbms/src/Core/config_core.h.in +++ b/dbms/src/Core/config_core.h.in @@ -10,5 +10,6 @@ #cmakedefine01 USE_POCO_DATAODBC #cmakedefine01 USE_POCO_MONGODB #cmakedefine01 USE_POCO_REDIS +#cmakedefine01 USE_POCO_JSON #cmakedefine01 USE_INTERNAL_LLVM_LIBRARY #cmakedefine01 USE_SSL diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index a0eba94bfdb..9e63c76f4cb 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -484,7 +484,7 @@ bool AvroRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) return false; } - +#ifdef USE_POCO_JSON class AvroConfluentRowInputFormat::SchemaRegistry { public: @@ -603,6 +603,7 @@ AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(SchemaId } return it->second; } +#endif void registerInputFormatProcessorAvro(FormatFactory & factory) { @@ -615,6 +616,7 @@ void registerInputFormatProcessorAvro(FormatFactory & factory) return std::make_shared(sample, buf, params); }); +#ifdef USE_POCO_JSON factory.registerInputFormatProcessor("AvroConfluent",[=]( ReadBuffer & buf, const Block & sample, @@ -623,6 +625,8 @@ void registerInputFormatProcessorAvro(FormatFactory & factory) { return std::make_shared(sample, buf, params, settings); }); +#endif + } } diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h 
b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h index ef5e01973dd..1941fb14300 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -46,6 +46,7 @@ private: AvroDeserializer deserializer; }; +#ifdef USE_POCO_JSON class AvroConfluentRowInputFormat : public IRowInputFormat { public: @@ -66,5 +67,7 @@ private: avro::InputStreamPtr input_stream; avro::DecoderPtr decoder; }; +#endif + } #endif From bfc610275d00d04a87f1f6b6ca05e44efa97a93a Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Fri, 10 Jan 2020 22:26:12 -0600 Subject: [PATCH 09/89] fix bad cast --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 9e63c76f4cb..6f2faaf09f0 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -293,7 +293,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node } if (target.isEnum()) { - const auto & enum_type = assert_cast(*target_type); + const auto & enum_type = dynamic_cast(*target_type); std::vector symbol_mapping; for (size_t i = 0; i < root_node->names(); i++) { From 9da0df4f0397f3e10156dbcb0a08dc8516be17c8 Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Sat, 11 Jan 2020 01:01:20 -0600 Subject: [PATCH 10/89] Add codec and sync interval settings --- dbms/src/Core/Settings.h | 2 ++ dbms/src/Formats/FormatFactory.cpp | 2 ++ dbms/src/Formats/FormatSettings.h | 6 +++- .../Formats/Impl/AvroRowOutputFormat.cpp | 32 +++++++++++++++---- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 26684153832..ab192289811 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -198,6 +198,8 @@ struct Settings : public 
SettingsCollection M(SettingUInt64, output_format_pretty_max_column_pad_width, 250, "Maximum width to pad all values in a column in Pretty formats.", 0) \ M(SettingBool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats", 0) \ M(SettingUInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.", 0) \ + M(SettingString, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \ + M(SettingUInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ \ M(SettingBool, use_client_time_zone, false, "Use client timezone for interpreting DateTime string values, instead of adopting server timezone.", 0) \ \ diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index ade91c5a391..f812b56aa5d 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -100,6 +100,8 @@ static FormatSettings getOutputFormatSetting(const Settings & settings, const Co format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter; format_settings.custom.row_after_delimiter = settings.format_custom_row_after_delimiter; format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter; + format_settings.avro.output_codec = settings.output_format_avro_codec; + format_settings.avro.output_sync_interval = settings.output_format_avro_sync_interval; return format_settings; } diff --git a/dbms/src/Formats/FormatSettings.h b/dbms/src/Formats/FormatSettings.h index 6ca54c12265..cc6f7f4dbb3 100644 --- a/dbms/src/Formats/FormatSettings.h +++ b/dbms/src/Formats/FormatSettings.h @@ -114,7 +114,11 @@ struct FormatSettings struct Avro { String schema_registry_url; - } avro; + String output_codec; + UInt64 output_sync_interval = 16 * 1024; + }; + + Avro avro; }; diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp 
b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 603eb05528d..9f6233303ad 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -42,12 +42,6 @@ #include #include -#define DEFAULT_SYNC_INTERVAL 16*1024 -#ifdef SNAPPY_CODEC_AVAILABLE -#define DEFAULT_CODEC avro::Codec::SNAPPY_CODEC -#else -#define DEFAULT_CODEC avro::Codec::DEFLATE_CODEC -#endif namespace DB { @@ -294,12 +288,36 @@ void AvroSerializer::serializeRow(const Columns & columns, size_t row_num, avro: } } +static avro::Codec getCodec(const std::string& codec_name) +{ + if (codec_name == "") + { +#ifdef SNAPPY_CODEC_AVAILABLE + return avro::Codec::SNAPPY_CODEC; +#else + return avro::Codec::DEFLATE_CODEC; +#endif + } + + if (codec_name == "null") return avro::Codec::NULL_CODEC; + if (codec_name == "deflate") return avro::Codec::DEFLATE_CODEC; +#ifdef SNAPPY_CODEC_AVAILABLE + if (codec_name == "snappy") return avro::Codec::SNAPPY_CODEC; +#endif + + throw Exception("Avro codec " + codec_name + " is not available", ErrorCodes::BAD_ARGUMENTS); +} + AvroRowOutputFormat::AvroRowOutputFormat( WriteBuffer & out_, const Block & header_, FormatFactory::WriteCallback callback, const FormatSettings & settings_) : IRowOutputFormat(header_, out_, callback) , settings(settings_) , serializer(header_.getColumnsWithTypeAndName()) - , file_writer(std::make_unique(out_), serializer.getSchema(), DEFAULT_SYNC_INTERVAL, DEFAULT_CODEC) + , file_writer( + std::make_unique(out_), + serializer.getSchema(), + settings.avro.output_sync_interval, + getCodec(settings.avro.output_codec)) { } From 8346fb59864817bcba0d96b042f870facb1ff36a Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 18 Jan 2020 21:05:54 +0300 Subject: [PATCH 11/89] Update .gitmodules --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 8147eb31799..206ae1e186e 100644 --- a/.gitmodules +++ b/.gitmodules @@ 
-46,7 +46,7 @@ url = https://github.com/ClickHouse-Extras/protobuf.git [submodule "contrib/boost"] path = contrib/boost - url = https://github.com/oandrew/clickhouse-boost + url = https://github.com/ClickHouse-Extras/boost.git [submodule "contrib/base64"] path = contrib/base64 url = https://github.com/aklomp/base64.git From 997ea7ed53a7ee6a14647b97ea8ab66ed310343c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jan 2020 22:29:53 +0300 Subject: [PATCH 12/89] Fixed error --- dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h | 2 +- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 4 ++-- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 61fd28f2a70..47240db8b0d 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -309,7 +309,7 @@ protected: /// Uses a DFA based approach in order to better handle patterns without /// time assertions. /// - /// NOTE: This implementation relies on the assumption that the pattern are *small*. + /// NOTE: This implementation relies on the assumption that the pattern is *small*. /// /// This algorithm performs in O(mn) (with m the number of DFA states and N the number /// of events) with a memory consumption and memory allocations in O(m). 
It means that diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 6f2faaf09f0..59cdbcb8650 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -484,7 +484,7 @@ bool AvroRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) return false; } -#ifdef USE_POCO_JSON +#if USE_POCO_JSON class AvroConfluentRowInputFormat::SchemaRegistry { public: @@ -616,7 +616,7 @@ void registerInputFormatProcessorAvro(FormatFactory & factory) return std::make_shared(sample, buf, params); }); -#ifdef USE_POCO_JSON +#if USE_POCO_JSON factory.registerInputFormatProcessor("AvroConfluent",[=]( ReadBuffer & buf, const Block & sample, diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h index 1941fb14300..7a9f9b239c4 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -1,5 +1,6 @@ #pragma once #include "config_formats.h" +#include "config_core.h" #if USE_AVRO #include @@ -46,7 +47,7 @@ private: AvroDeserializer deserializer; }; -#ifdef USE_POCO_JSON +#if USE_POCO_JSON class AvroConfluentRowInputFormat : public IRowInputFormat { public: From e6b407a961b1cdd81bf276ce806c8f34b246aa58 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jan 2020 22:34:09 +0300 Subject: [PATCH 13/89] Added TODO --- .../Processors/Formats/Impl/AvroRowInputFormat.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 59cdbcb8650..0332cd4e354 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -498,18 +498,20 @@ public: { base_url = base_url_; } - catch 
(Poco::SyntaxException & e) + catch (const Poco::SyntaxException & e) { throw Exception("Invalid Schema Registry URL: " + e.displayText(), ErrorCodes::BAD_ARGUMENTS); } } - avro::ValidSchema getSchema(uint32_t id) + avro::ValidSchema getSchema(uint32_t id) const { try { try { + /// TODO Host checking to prevent SSRF + Poco::URI url(base_url, "/schemas/ids/" + std::to_string(id)); Poco::Net::HTTPClientSession session(url.getHost(), url.getPort()); Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, url.getPathAndQuery()); @@ -518,16 +520,16 @@ public: auto & response_body = session.receiveResponse(response); if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_OK) { - throw Exception("http code " + std::to_string(response.getStatus()), ErrorCodes::INCORRECT_DATA); + throw Exception("HTTP code " + std::to_string(response.getStatus()), ErrorCodes::INCORRECT_DATA); } Poco::JSON::Parser parser; auto json_body = parser.parse(response_body).extract(); auto schema = json_body->getValue("schema"); return avro::compileJsonSchemaFromString(schema); } - catch (const Exception & e) + catch (const Exception &) { - throw e; + throw; } catch (const Poco::Exception & e) { @@ -540,7 +542,7 @@ public: } catch (Exception & e) { - e.addMessage("while fetching schema id=" + std::to_string(id)); + e.addMessage("while fetching schema id = " + std::to_string(id)); throw; } } From 85f2e86e5bcde32f7c7e7aa71389f3277893ceaf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jan 2020 23:10:11 +0300 Subject: [PATCH 14/89] Added performance test for Avro format --- dbms/tests/performance/parse_engine_file.xml | 1 + dbms/tests/performance/select_format.xml | 1 + 2 files changed, 2 insertions(+) diff --git a/dbms/tests/performance/parse_engine_file.xml b/dbms/tests/performance/parse_engine_file.xml index 080acbd53f2..8a0054bdd7f 100644 --- a/dbms/tests/performance/parse_engine_file.xml +++ b/dbms/tests/performance/parse_engine_file.xml @@ -34,6 +34,7 @@ TSKV RowBinary 
Native + Avro diff --git a/dbms/tests/performance/select_format.xml b/dbms/tests/performance/select_format.xml index 621247fee1e..189b35a2700 100644 --- a/dbms/tests/performance/select_format.xml +++ b/dbms/tests/performance/select_format.xml @@ -44,6 +44,7 @@ XML ODBCDriver2 MySQLWire + Avro From f0b4fcee1f3eca214365a96b0aa685a58f1f996e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jan 2020 23:10:32 +0300 Subject: [PATCH 15/89] Minor modifications --- dbms/src/IO/ReadHelpers.h | 18 ++++++++++++++++++ .../Formats/Impl/AvroRowInputFormat.cpp | 13 ++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/dbms/src/IO/ReadHelpers.h b/dbms/src/IO/ReadHelpers.h index fc8e444330c..9cb26434930 100644 --- a/dbms/src/IO/ReadHelpers.h +++ b/dbms/src/IO/ReadHelpers.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -746,6 +747,23 @@ inline void readBinary(Decimal128 & x, ReadBuffer & buf) { readPODBinary(x, buf) inline void readBinary(LocalDate & x, ReadBuffer & buf) { readPODBinary(x, buf); } +template +inline std::enable_if_t && (sizeof(T) <= 8), void> +readBinaryBigEndian(T & x, ReadBuffer & buf) /// Assuming little endian architecture. +{ + readPODBinary(x, buf); + + if constexpr (sizeof(x) == 1) + return; + else if constexpr (sizeof(x) == 2) + x = bswap_16(x); + else if constexpr (sizeof(x) == 4) + x = bswap_32(x); + else if constexpr (sizeof(x) == 8) + x = bswap_64(x); +} + + /// Generic methods to read value in text tab-separated format. 
template inline std::enable_if_t, void> diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 0332cd4e354..25cdc0e9abc 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -553,16 +553,16 @@ private: static uint32_t readConfluentSchemaId(ReadBuffer & in) { - Poco::Buffer buf(5); - in.readStrict(buf.begin(), buf.capacity()); - Poco::MemoryBinaryReader binary_reader(buf, Poco::BinaryReader::BIG_ENDIAN_BYTE_ORDER); - uint8_t magic; uint32_t schema_id; - binary_reader >> magic >> schema_id; + + readBinaryBigEndian(magic, in); + readBinaryBigEndian(schema_id, in); + if (magic != 0x00) { - throw Exception("Invalid magic byte", ErrorCodes::INCORRECT_DATA); + throw Exception("Invalid magic byte before AvroConfluent schema identifier." + " Must be zero byte, found " + std::to_string(int(magic)) + " instead", ErrorCodes::INCORRECT_DATA); } return schema_id; @@ -577,7 +577,6 @@ AvroConfluentRowInputFormat::AvroConfluentRowInputFormat( , decoder(avro::binaryDecoder()) { - (void)format_settings_; decoder->init(*input_stream); } From ac46498f60b2ff54a390e3f0615f7915eb4b4faf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jan 2020 23:12:58 +0300 Subject: [PATCH 16/89] Minor modification --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 4 ++-- dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 25cdc0e9abc..75f5634578e 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -608,7 +608,7 @@ AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(SchemaId void registerInputFormatProcessorAvro(FormatFactory & factory) 
{ - factory.registerInputFormatProcessor("Avro", [=]( + factory.registerInputFormatProcessor("Avro", []( ReadBuffer & buf, const Block & sample, const RowInputFormatParams & params, @@ -618,7 +618,7 @@ void registerInputFormatProcessorAvro(FormatFactory & factory) }); #if USE_POCO_JSON - factory.registerInputFormatProcessor("AvroConfluent",[=]( + factory.registerInputFormatProcessor("AvroConfluent",[]( ReadBuffer & buf, const Block & sample, const RowInputFormatParams & params, diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 9f6233303ad..233d9284fc7 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -342,7 +342,7 @@ void AvroRowOutputFormat::writeSuffix() void registerOutputFormatProcessorAvro(FormatFactory & factory) { - factory.registerOutputFormatProcessor("Avro",[=]( + factory.registerOutputFormatProcessor("Avro", []( WriteBuffer & buf, const Block & sample, FormatFactory::WriteCallback callback, From 216b39a30de3a65104c9608712ba81b9f618b293 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jan 2020 23:15:49 +0300 Subject: [PATCH 17/89] Minor changes for consistency --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 7 +------ dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h | 4 ++-- dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp | 2 -- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 75f5634578e..8d96afc80dc 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -8,12 +8,9 @@ #include #include - #include - #include - #include #include #include @@ -27,14 +24,12 @@ #include #include - #include #include #include #include #include - #include #include 
#include @@ -414,7 +409,7 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) } -AvroDeserializer::AvroDeserializer(const DB::ColumnsWithTypeAndName & columns, avro::ValidSchema schema) +AvroDeserializer::AvroDeserializer(const ColumnsWithTypeAndName & columns, avro::ValidSchema schema) { auto schema_root = schema.root(); if (schema_root->type() != avro::AVRO_RECORD) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h index 7a9f9b239c4..353f611a36e 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -21,7 +21,7 @@ namespace DB class AvroDeserializer { public: - AvroDeserializer(const DB::ColumnsWithTypeAndName & columns, avro::ValidSchema schema); + AvroDeserializer(const ColumnsWithTypeAndName & columns, avro::ValidSchema schema); void deserializeRow(MutableColumns & columns, avro::Decoder & decoder); private: @@ -56,7 +56,7 @@ public: String getName() const override { return "AvroConfluentRowInputFormat"; } private: - const DB::ColumnsWithTypeAndName header_columns; + const ColumnsWithTypeAndName header_columns; class SchemaRegistry; std::unique_ptr schema_registry; diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 233d9284fc7..55fc9d08ca9 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -8,7 +8,6 @@ #include #include - #include #include @@ -19,7 +18,6 @@ #include #include - #include #include #include From 457c50c7d71bbf393480d4a5f0fd6c5c2f768c60 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jan 2020 23:19:10 +0300 Subject: [PATCH 18/89] Code cleanups around --- dbms/src/Common/RemoteHostFilter.cpp | 5 +++-- dbms/src/Common/RemoteHostFilter.h | 11 +++++++---- 2 files changed, 10 insertions(+), 6 
deletions(-) diff --git a/dbms/src/Common/RemoteHostFilter.cpp b/dbms/src/Common/RemoteHostFilter.cpp index 16aaac35dbe..4c4aa3bca81 100644 --- a/dbms/src/Common/RemoteHostFilter.cpp +++ b/dbms/src/Common/RemoteHostFilter.cpp @@ -1,12 +1,13 @@ #include -#include #include -#include #include +#include +#include #include #include #include + namespace DB { namespace ErrorCodes diff --git a/dbms/src/Common/RemoteHostFilter.h b/dbms/src/Common/RemoteHostFilter.h index 86743891051..48d9b2bda7c 100644 --- a/dbms/src/Common/RemoteHostFilter.h +++ b/dbms/src/Common/RemoteHostFilter.h @@ -1,17 +1,19 @@ #pragma once +#include #include #include -#include -#include +namespace Poco { class URI; } +namespace Poco { namespace Util { class AbstractConfiguration; } } + namespace DB { class RemoteHostFilter { /** - * This class checks if url is allowed. + * This class checks if URL is allowed. * If primary_hosts and regexp_hosts are empty all urls are allowed. */ public: @@ -25,6 +27,7 @@ private: std::unordered_set primary_hosts; /// Allowed primary () URL from config.xml std::vector regexp_hosts; /// Allowed regexp () URL from config.xml - bool checkForDirectEntry(const std::string & str) const; /// Checks if the primary_hosts and regexp_hosts contain str. If primary_hosts and regexp_hosts are empty return true. + /// Checks if the primary_hosts and regexp_hosts contain str. If primary_hosts and regexp_hosts are empty return true. 
+ bool checkForDirectEntry(const std::string & str) const; }; } From 3c7df7ca315c5cb52a7bffe9934c41988b44bbb0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jan 2020 23:42:50 +0300 Subject: [PATCH 19/89] Minor modifications --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 2 +- dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 8d96afc80dc..795d69c7505 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -438,7 +438,7 @@ AvroDeserializer::AvroDeserializer(const ColumnsWithTypeAndName & columns, avro: catch (Exception & e) { e.addMessage("column " + column.name); - e.rethrow(); + throw; } field_mapping[field_index] = i; } diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 55fc9d08ca9..f303fea2b28 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -271,7 +271,7 @@ AvroSerializer::AvroSerializer(const ColumnsWithTypeAndName & columns) catch (Exception & e) { e.addMessage("column " + column.name); - e.rethrow(); + throw; } } schema.setSchema(record_schema); From d732f854b73e79c9f23a3ad33448423316c86f92 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 Jan 2020 00:42:51 +0300 Subject: [PATCH 20/89] Added timeouts for schema request --- .../Formats/Impl/AvroRowInputFormat.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 795d69c7505..a64ad0cd705 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ 
b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -5,8 +5,10 @@ #include #include + #include #include +#include #include #include @@ -508,15 +510,23 @@ public: /// TODO Host checking to prevent SSRF Poco::URI url(base_url, "/schemas/ids/" + std::to_string(id)); - Poco::Net::HTTPClientSession session(url.getHost(), url.getPort()); + + /// One second for connect/send/receive. Just in case. + ConnectionTimeouts timeouts({1, 0}, {1, 0}, {1, 0}); + Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, url.getPathAndQuery()); - session.sendRequest(request); + + auto session = makePooledHTTPSession(url, timeouts, 1); + session->sendRequest(request); + Poco::Net::HTTPResponse response; - auto & response_body = session.receiveResponse(response); + auto & response_body = session->receiveResponse(response); + if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_OK) { throw Exception("HTTP code " + std::to_string(response.getStatus()), ErrorCodes::INCORRECT_DATA); } + Poco::JSON::Parser parser; auto json_body = parser.parse(response_body).extract(); auto schema = json_body->getValue("schema"); From ae185a24d201d777bcfda0bf1b7288941f44570a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 Jan 2020 00:45:16 +0300 Subject: [PATCH 21/89] Disable AvroConfluent --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index a64ad0cd705..13797a9ca56 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -623,6 +623,12 @@ void registerInputFormatProcessorAvro(FormatFactory & factory) }); #if USE_POCO_JSON + + /// AvroConfluent format is disabled for the following reasons: + /// 1. There is no test for it. + /// 2. RemoteHostFilter is not used to prevent SSRF attacks.
+ +#if 0 factory.registerInputFormatProcessor("AvroConfluent",[]( ReadBuffer & buf, const Block & sample, @@ -633,6 +639,8 @@ void registerInputFormatProcessorAvro(FormatFactory & factory) }); #endif +#endif + } } From 82cec8b05a5261ce60de50b00716fd0fe4748682 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Sun, 19 Jan 2020 01:37:12 +0300 Subject: [PATCH 22/89] Refactoring --- dbms/programs/client/Client.cpp | 13 +++- libs/libcommon/CMakeLists.txt | 14 ++++- libs/libcommon/include/common/LineReader.h | 23 ++++--- .../include/common/ReplxxLineReader.h | 18 ++++++ libs/libcommon/src/LineReader.cpp | 52 +--------------- libs/libcommon/src/ReplxxLineReader.cpp | 60 +++++++++++++++++++ utils/zookeeper-cli/zookeeper-cli.cpp | 13 +++- 7 files changed, 124 insertions(+), 69 deletions(-) create mode 100644 libs/libcommon/include/common/ReplxxLineReader.h create mode 100644 libs/libcommon/src/ReplxxLineReader.cpp diff --git a/dbms/programs/client/Client.cpp b/dbms/programs/client/Client.cpp index 1c4902c48f6..077b4d68025 100644 --- a/dbms/programs/client/Client.cpp +++ b/dbms/programs/client/Client.cpp @@ -2,6 +2,12 @@ #include "ConnectionParameters.h" #include "Suggest.h" +#ifdef USE_REPLXX +# include +#else +# include +#endif + #include #include #include @@ -19,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -496,7 +501,11 @@ private: if (!history_file.empty() && !Poco::File(history_file).exists()) Poco::File(history_file).createFile(); - LineReader lr(Suggest::instance(), history_file, '\\', config().has("multiline") ? ';' : 0); +#ifdef USE_REPLXX + ReplxxLineReader lr(Suggest::instance(), history_file, '\\', config().has("multiline") ? ';' : 0); +#else + LineReader lr(history_file, '\\', config().has("multiline") ? 
';' : 0); +#endif do { diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 3267bbe6ce1..4ecad2e5232 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -10,7 +10,7 @@ if (DEFINED APPLE_HAVE_CLOCK_GETTIME) target_compile_definitions(apple_rt PUBLIC -DAPPLE_HAVE_CLOCK_GETTIME=${APPLE_HAVE_CLOCK_GETTIME}) endif () -add_library (common +set (COMMON_SRCS src/argsToConfig.cpp src/coverage.cpp src/DateLUT.cpp @@ -65,7 +65,19 @@ add_library (common include/ext/scope_guard.h include/ext/size.h include/ext/unlock_guard.h +) +if (ENABLE_READLINE) + set (COMMON_SRCS + src/ReplxxLineReader.cpp + include/common/ReplxxLineReader.h + + ${COMMON_SRCS} + ) +endif () + +add_library (common + ${COMMON_SRCS} ${CONFIG_COMMON}) if (USE_INTERNAL_MEMCPY) diff --git a/libs/libcommon/include/common/LineReader.h b/libs/libcommon/include/common/LineReader.h index 120ff76dac6..c843a2ece35 100644 --- a/libs/libcommon/include/common/LineReader.h +++ b/libs/libcommon/include/common/LineReader.h @@ -22,8 +22,8 @@ public: WordsRange getCompletions(const String & prefix, size_t prefix_length) const; }; - LineReader(const Suggest * suggest, const String & history_file_path, char extender, char delimiter = 0); /// if delimiter != 0, then it's multiline mode - ~LineReader(); + LineReader(const String & history_file_path, char extender, char delimiter = 0); /// if delimiter != 0, then it's multiline mode + virtual ~LineReader() {} /// Reads the whole line until delimiter (in multiline mode) or until the last line without extender. /// If resulting line is empty, it means the user interrupted the input. @@ -31,7 +31,7 @@ public: /// Typical delimiter is ';' (semicolon) and typical extender is '\' (backslash). 
String readLine(const String & first_prompt, const String & second_prompt); -private: +protected: enum InputStatus { ABORT = 0, @@ -39,19 +39,16 @@ private: INPUT_LINE, }; - String input; - String prev_line; const String history_file_path; + + String input; + +private: const char extender; const char delimiter; - InputStatus readOneLine(const String & prompt); - void addToHistory(const String & line); + String prev_line; - /// Since CMake doesn't impose restrictions on includes between unrelated targets - /// it's possible that we include this file without USE_REPLXX defined. -#ifdef __clang__ - [[maybe_unused]] -#endif - void * impl; + virtual InputStatus readOneLine(const String & prompt); + virtual void addToHistory(const String &) {} }; diff --git a/libs/libcommon/include/common/ReplxxLineReader.h b/libs/libcommon/include/common/ReplxxLineReader.h new file mode 100644 index 00000000000..900b56b8422 --- /dev/null +++ b/libs/libcommon/include/common/ReplxxLineReader.h @@ -0,0 +1,18 @@ +#pragma once + +#include "LineReader.h" + +#include + +class ReplxxLineReader : public LineReader +{ +public: + ReplxxLineReader(const Suggest * suggest, const String & history_file_path, char extender, char delimiter = 0); + ~ReplxxLineReader() override; + +private: + InputStatus readOneLine(const String & prompt) override; + void addToHistory(const String & line) override; + + replxx::Replxx rx; +}; diff --git a/libs/libcommon/src/LineReader.cpp b/libs/libcommon/src/LineReader.cpp index 5e4c853b185..417e9d7467b 100644 --- a/libs/libcommon/src/LineReader.cpp +++ b/libs/libcommon/src/LineReader.cpp @@ -1,9 +1,5 @@ #include -#ifdef USE_REPLXX -# include -#endif - #include #include @@ -43,42 +39,12 @@ LineReader::Suggest::WordsRange LineReader::Suggest::getCompletions(const String }); } -LineReader::LineReader(const Suggest * suggest, const String & history_file_path_, char extender_, char delimiter_) +LineReader::LineReader(const String & history_file_path_, char extender_, char 
delimiter_) : history_file_path(history_file_path_), extender(extender_), delimiter(delimiter_) { -#ifdef USE_REPLXX - impl = new replxx::Replxx; - auto & rx = *(replxx::Replxx*)(impl); - - if (!history_file_path.empty()) - rx.history_load(history_file_path); - - auto callback = [suggest] (const String & context, size_t context_size) - { - auto range = suggest->getCompletions(context, context_size); - return replxx::Replxx::completions_t(range.first, range.second); - }; - - if (suggest) - { - rx.set_completion_callback(callback); - rx.set_complete_on_empty(false); - rx.set_word_break_characters(" \t\n\r\"\\'`@$><=;|&{(."); - } -#endif /// FIXME: check extender != delimiter } -LineReader::~LineReader() -{ -#ifdef USE_REPLXX - auto & rx = *(replxx::Replxx*)(impl); - if (!history_file_path.empty()) - rx.history_save(history_file_path); - delete (replxx::Replxx *)impl; -#endif -} - String LineReader::readLine(const String & first_prompt, const String & second_prompt) { String line; @@ -127,27 +93,11 @@ LineReader::InputStatus LineReader::readOneLine(const String & prompt) { input.clear(); -#ifdef USE_REPLXX - auto & rx = *(replxx::Replxx*)(impl); - const char* cinput = rx.input(prompt); - if (cinput == nullptr) - return (errno != EAGAIN) ? 
ABORT : RESET_LINE; - input = cinput; -#else std::cout << prompt; std::getline(std::cin, input); if (!std::cin.good()) return ABORT; -#endif trim(input); return INPUT_LINE; } - -void LineReader::addToHistory(const String & line) -{ -#ifdef USE_REPLXX - auto & rx = *(replxx::Replxx*)(impl); - rx.history_add(line); -#endif -} diff --git a/libs/libcommon/src/ReplxxLineReader.cpp b/libs/libcommon/src/ReplxxLineReader.cpp new file mode 100644 index 00000000000..1d1f6a6f057 --- /dev/null +++ b/libs/libcommon/src/ReplxxLineReader.cpp @@ -0,0 +1,60 @@ +#include + +#include +#include +#include + +namespace +{ + +/// Trim ending whitespace inplace +void trim(String & s) +{ + s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { return !std::isspace(ch); }).base(), s.end()); +} + +} + +ReplxxLineReader::ReplxxLineReader(const Suggest * suggest, const String & history_file_path_, char extender_, char delimiter_) + : LineReader(history_file_path_, extender_, delimiter_) +{ + if (!history_file_path.empty()) + rx.history_load(history_file_path); + + auto callback = [suggest] (const String & context, size_t context_size) + { + auto range = suggest->getCompletions(context, context_size); + return replxx::Replxx::completions_t(range.first, range.second); + }; + + if (suggest) + { + rx.set_completion_callback(callback); + rx.set_complete_on_empty(false); + rx.set_word_break_characters(" \t\n\r\"\\'`@$><=;|&{(."); + } +} + +ReplxxLineReader::~ReplxxLineReader() +{ + if (!history_file_path.empty()) + rx.history_save(history_file_path); +} + +LineReader::InputStatus ReplxxLineReader::readOneLine(const String & prompt) +{ + input.clear(); + + const char* cinput = rx.input(prompt); + if (cinput == nullptr) + return (errno != EAGAIN) ? 
ABORT : RESET_LINE; + input = cinput; + + trim(input); + return INPUT_LINE; +} + +void ReplxxLineReader::addToHistory(const String & line) +{ + rx.history_add(line); +} diff --git a/utils/zookeeper-cli/zookeeper-cli.cpp b/utils/zookeeper-cli/zookeeper-cli.cpp index 5e36ffecdaa..c3434987bd6 100644 --- a/utils/zookeeper-cli/zookeeper-cli.cpp +++ b/utils/zookeeper-cli/zookeeper-cli.cpp @@ -4,10 +4,15 @@ #include #include #include -#include #include #include +#ifdef USE_REPLXX +# include +#else +# include +#endif + void printStat(const Coordination::Stat & s) { @@ -69,7 +74,11 @@ int main(int argc, char ** argv) Logger::root().setLevel("trace"); zkutil::ZooKeeper zk(argv[1]); - LineReader lr(nullptr, {}, '\\'); +#ifdef USE_REPLXX + ReplxxLineReader lr(nullptr, {}, '\\'); +#else + LineReader lr({}, '\\'); +#endif do { From 4fdfb02b2f87814d3669f5fb478a202c90a55592 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 Jan 2020 03:43:59 +0300 Subject: [PATCH 23/89] Fixed build --- dbms/src/IO/ReadHelpers.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dbms/src/IO/ReadHelpers.h b/dbms/src/IO/ReadHelpers.h index 9cb26434930..2e6e51a7835 100644 --- a/dbms/src/IO/ReadHelpers.h +++ b/dbms/src/IO/ReadHelpers.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -756,11 +755,11 @@ readBinaryBigEndian(T & x, ReadBuffer & buf) /// Assuming little endian archi if constexpr (sizeof(x) == 1) return; else if constexpr (sizeof(x) == 2) - x = bswap_16(x); + x = __builtin_bswap16(x); else if constexpr (sizeof(x) == 4) - x = bswap_32(x); + x = __builtin_bswap32(x); else if constexpr (sizeof(x) == 8) - x = bswap_64(x); + x = __builtin_bswap64(x); } From b5ff5341d5b8154c294315b2a8341088e050bc24 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 Jan 2020 04:22:27 +0300 Subject: [PATCH 24/89] Support more types for output --- .../Formats/Impl/AvroRowOutputFormat.cpp | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 
deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index f303fea2b28..b0375c7e2ae 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -83,15 +83,41 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF switch (data_type->getTypeId()) { case TypeIndex::UInt8: - return {avro::BoolSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { - encoder.encodeBool(assert_cast(column).getElement(row_num)); + encoder.encodeInt(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::Int8: + return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeInt(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::UInt16: + return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeInt(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::Int16: + return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeInt(assert_cast(column).getElement(row_num)); + }}; + case TypeIndex::UInt32: [[fallthrough]]; + case TypeIndex::DateTime: + return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeInt(assert_cast(column).getElement(row_num)); }}; case TypeIndex::Int32: return {avro::IntSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { encoder.encodeInt(assert_cast(column).getElement(row_num)); }}; + case TypeIndex::UInt64: + return {avro::LongSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + encoder.encodeLong(assert_cast(column).getElement(row_num)); + }}; case 
TypeIndex::Int64: return {avro::LongSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { @@ -136,7 +162,7 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF }}; } case TypeIndex::String: - return {avro::StringSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + return {avro::BytesSchema(), [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { const StringRef & s = assert_cast(column).getDataAt(row_num); encoder.encodeBytes(reinterpret_cast(s.data), s.size); From 40e35c36e7849ca2bbd9036e75f5c466cb18833b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 Jan 2020 20:02:29 +0300 Subject: [PATCH 25/89] Minor modifications --- .../Formats/Impl/AvroRowInputFormat.cpp | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 13797a9ca56..649ed777c4f 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -64,12 +64,9 @@ namespace DB { namespace ErrorCodes { - extern const int BAD_TYPE_OF_FIELD; extern const int BAD_ARGUMENTS; extern const int THERE_IS_NO_COLUMN; - extern const int LOGICAL_ERROR; extern const int INCORRECT_DATA; - extern const int CANNOT_READ_ALL_DATA; extern const int ILLEGAL_COLUMN; extern const int TYPE_MISMATCH; } @@ -114,18 +111,9 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node WhichDataType target(target_type); switch (root_node->type()) { - case avro::AVRO_STRING: - if (target.isString()) - { - return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable - { - decoder.decodeString(tmp); - column.insertData(tmp.c_str(), tmp.length()); - }; - } - break; + case avro::AVRO_STRING: [[fallthrough]]; case avro::AVRO_BYTES: - if (target.isString()) + if (target.isString() 
|| target.isFixedString()) { return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable { From 05048bf0c1869b959db60d0d243fbaa654471355 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 Jan 2020 20:08:15 +0300 Subject: [PATCH 26/89] Fixed "unbundled" build --- cmake/find/replxx.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/find/replxx.cmake b/cmake/find/replxx.cmake index 13df104515e..3a0e5917b04 100644 --- a/cmake/find/replxx.cmake +++ b/cmake/find/replxx.cmake @@ -1,4 +1,4 @@ -option (ENABLE_REPLXX "Enable replxx support" ${ENABLE_LIBRARIES}) +option (ENABLE_REPLXX "Enable replxx support" ${NOT_UNBUNDLED}) if (ENABLE_REPLXX) option (USE_INTERNAL_REPLXX "Use internal replxx library" ${NOT_UNBUNDLED}) From 2e13f63de845dc379bbe6764a848c1406f5d529f Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Mon, 20 Jan 2020 17:34:27 +0300 Subject: [PATCH 27/89] Refactor configuration --- CMakeLists.txt | 1 - cmake/find/replxx.cmake | 40 ----------- contrib/CMakeLists.txt | 4 +- contrib/replxx-cmake/CMakeLists.txt | 66 ++++++++++++++----- dbms/programs/client/CMakeLists.txt | 2 +- dbms/programs/client/Client.cpp | 4 +- ...StorageSystemBuildOptions.generated.cpp.in | 1 - libs/libcommon/CMakeLists.txt | 6 +- .../include/common/ReplxxLineReader.h | 2 +- .../include/common/config_common.h.in | 1 - libs/libcommon/src/ReplxxLineReader.cpp | 6 +- utils/zookeeper-cli/CMakeLists.txt | 2 +- utils/zookeeper-cli/zookeeper-cli.cpp | 4 +- 13 files changed, 64 insertions(+), 75 deletions(-) delete mode 100644 cmake/find/replxx.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index d37cdfc3af8..7c8ccb6e17c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -352,7 +352,6 @@ include (cmake/find/simdjson.cmake) include (cmake/find/rapidjson.cmake) include (cmake/find/fastops.cmake) include (cmake/find/orc.cmake) -include (cmake/find/replxx.cmake) find_contrib_lib(cityhash) find_contrib_lib(farmhash) diff --git 
a/cmake/find/replxx.cmake b/cmake/find/replxx.cmake deleted file mode 100644 index 3a0e5917b04..00000000000 --- a/cmake/find/replxx.cmake +++ /dev/null @@ -1,40 +0,0 @@ -option (ENABLE_REPLXX "Enable replxx support" ${NOT_UNBUNDLED}) - -if (ENABLE_REPLXX) - option (USE_INTERNAL_REPLXX "Use internal replxx library" ${NOT_UNBUNDLED}) - - if (USE_INTERNAL_REPLXX AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/replxx/README.md") - message (WARNING "submodule contrib/replxx is missing. to fix try run: \n git submodule update --init --recursive") - set (USE_INTERNAL_REPLXX 0) - endif () - - if (NOT USE_INTERNAL_REPLXX) - find_library(LIBRARY_REPLXX NAMES replxx replxx-static) - find_path(INCLUDE_REPLXX replxx.hxx) - - add_library(replxx UNKNOWN IMPORTED) - set_property(TARGET replxx PROPERTY IMPORTED_LOCATION ${LIBRARY_REPLXX}) - target_include_directories(replxx PUBLIC ${INCLUDE_REPLXX}) - - set(CMAKE_REQUIRED_LIBRARIES replxx) - check_cxx_source_compiles( - " - #include - int main() { - replxx::Replxx rx; - } - " - EXTERNAL_REPLXX_WORKS - ) - - if (NOT EXTERNAL_REPLXX_WORKS) - message (FATAL_ERROR "replxx is unusable: ${LIBRARY_REPLXX} ${INCLUDE_REPLXX}") - endif () - endif () - - set(USE_REPLXX 1) - - message (STATUS "Using replxx") -else () - set(USE_REPLXX 0) -endif () diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index f81d616cddd..89f12ce0b70 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -332,6 +332,4 @@ if (USE_FASTOPS) add_subdirectory (fastops-cmake) endif() -if (USE_INTERNAL_REPLXX) - add_subdirectory (replxx-cmake) -endif() +add_subdirectory(replxx-cmake) diff --git a/contrib/replxx-cmake/CMakeLists.txt b/contrib/replxx-cmake/CMakeLists.txt index 1b27fd53070..c2dfe3ac823 100644 --- a/contrib/replxx-cmake/CMakeLists.txt +++ b/contrib/replxx-cmake/CMakeLists.txt @@ -1,18 +1,52 @@ -set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/replxx") +option (ENABLE_REPLXX "Enable replxx support" ${ENABLE_LIBRARIES}) -set(SRCS - 
${LIBRARY_DIR}/src/conversion.cxx - ${LIBRARY_DIR}/src/escape.cxx - ${LIBRARY_DIR}/src/history.cxx - ${LIBRARY_DIR}/src/io.cxx - ${LIBRARY_DIR}/src/prompt.cxx - ${LIBRARY_DIR}/src/replxx.cxx - ${LIBRARY_DIR}/src/replxx_impl.cxx - ${LIBRARY_DIR}/src/util.cxx - ${LIBRARY_DIR}/src/wcwidth.cpp - ${LIBRARY_DIR}/src/ConvertUTF.cpp -) +if (ENABLE_REPLXX) + option (USE_INTERNAL_REPLXX "Use internal replxx library" ${NOT_UNBUNDLED}) -add_library(replxx ${SRCS}) -target_include_directories(replxx PUBLIC ${LIBRARY_DIR}/include) -target_compile_options(replxx PUBLIC -Wno-documentation) + if (USE_INTERNAL_REPLXX) + set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/replxx") + + set(SRCS + ${LIBRARY_DIR}/src/conversion.cxx + ${LIBRARY_DIR}/src/ConvertUTF.cpp + ${LIBRARY_DIR}/src/escape.cxx + ${LIBRARY_DIR}/src/history.cxx + ${LIBRARY_DIR}/src/io.cxx + ${LIBRARY_DIR}/src/prompt.cxx + ${LIBRARY_DIR}/src/replxx_impl.cxx + ${LIBRARY_DIR}/src/replxx.cxx + ${LIBRARY_DIR}/src/util.cxx + ${LIBRARY_DIR}/src/wcwidth.cpp + ) + + add_library (replxx ${SRCS}) + target_include_directories(replxx PUBLIC ${LIBRARY_DIR}/include) + else () + find_library(LIBRARY_REPLXX NAMES replxx replxx-static) + find_path(INCLUDE_REPLXX replxx.hxx) + + add_library(replxx UNKNOWN IMPORTED) + set_property(TARGET replxx PROPERTY IMPORTED_LOCATION ${LIBRARY_REPLXX}) + target_include_directories(replxx PUBLIC ${INCLUDE_REPLXX}) + + set(CMAKE_REQUIRED_LIBRARIES replxx) + check_cxx_source_compiles( + " + #include + int main() { + replxx::Replxx rx; + } + " + EXTERNAL_REPLXX_WORKS + ) + + if (NOT EXTERNAL_REPLXX_WORKS) + message (FATAL_ERROR "replxx is unusable: ${LIBRARY_REPLXX} ${INCLUDE_REPLXX}") + endif () + endif () + + target_compile_options(replxx PUBLIC -Wno-documentation) + target_compile_definitions(replxx PUBLIC USE_REPLXX=1) + + message (STATUS "Using replxx") +endif () diff --git a/dbms/programs/client/CMakeLists.txt b/dbms/programs/client/CMakeLists.txt index d4c157ac3b0..11ade559a8d 100644 --- 
a/dbms/programs/client/CMakeLists.txt +++ b/dbms/programs/client/CMakeLists.txt @@ -4,7 +4,7 @@ set(CLICKHOUSE_CLIENT_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Suggest.cpp ) -set(CLICKHOUSE_CLIENT_LINK PRIVATE clickhouse_common_config clickhouse_functions clickhouse_aggregate_functions clickhouse_common_io clickhouse_parsers string_utils ${LINE_EDITING_LIBS} ${Boost_PROGRAM_OPTIONS_LIBRARY}) +set(CLICKHOUSE_CLIENT_LINK PRIVATE clickhouse_common_config clickhouse_functions clickhouse_aggregate_functions clickhouse_common_io clickhouse_parsers string_utils ${Boost_PROGRAM_OPTIONS_LIBRARY}) include(CheckSymbolExists) check_symbol_exists(readpassphrase readpassphrase.h HAVE_READPASSPHRASE) diff --git a/dbms/programs/client/Client.cpp b/dbms/programs/client/Client.cpp index 76b714003fa..49ca3255f35 100644 --- a/dbms/programs/client/Client.cpp +++ b/dbms/programs/client/Client.cpp @@ -2,7 +2,7 @@ #include "ConnectionParameters.h" #include "Suggest.h" -#ifdef USE_REPLXX +#if USE_REPLXX # include #else # include @@ -501,7 +501,7 @@ private: if (!history_file.empty() && !Poco::File(history_file).exists()) Poco::File(history_file).createFile(); -#ifdef USE_REPLXX +#if USE_REPLXX ReplxxLineReader lr(Suggest::instance(), history_file, '\\', config().has("multiline") ? ';' : 0); #else LineReader lr(history_file, '\\', config().has("multiline") ? 
';' : 0); diff --git a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index 550ead28996..65c4f19b7cb 100644 --- a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -61,7 +61,6 @@ const char * auto_config_build[] "USE_HYPERSCAN", "@USE_HYPERSCAN@", "USE_SIMDJSON", "@USE_SIMDJSON@", "USE_POCO_REDIS", "@USE_POCO_REDIS@", - "USE_REPLXX", "@USE_REPLXX@", nullptr, nullptr }; diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index a2e70e673b7..b83c876978e 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -67,7 +67,7 @@ set (COMMON_SRCS include/ext/unlock_guard.h ) -if (ENABLE_READLINE) +if (ENABLE_REPLXX) set (COMMON_SRCS src/ReplxxLineReader.cpp include/common/ReplxxLineReader.h @@ -104,8 +104,8 @@ if(CCTZ_LIBRARY) target_link_libraries(common PRIVATE ${CCTZ_LIBRARY}) endif() -if (USE_REPLXX) - target_link_libraries(common PRIVATE replxx) +if (ENABLE_REPLXX) + target_link_libraries(common PUBLIC replxx) endif () target_link_libraries (common diff --git a/libs/libcommon/include/common/ReplxxLineReader.h b/libs/libcommon/include/common/ReplxxLineReader.h index 900b56b8422..47eabbf9330 100644 --- a/libs/libcommon/include/common/ReplxxLineReader.h +++ b/libs/libcommon/include/common/ReplxxLineReader.h @@ -7,7 +7,7 @@ class ReplxxLineReader : public LineReader { public: - ReplxxLineReader(const Suggest * suggest, const String & history_file_path, char extender, char delimiter = 0); + ReplxxLineReader(const Suggest & suggest, const String & history_file_path, char extender, char delimiter = 0); ~ReplxxLineReader() override; private: diff --git a/libs/libcommon/include/common/config_common.h.in b/libs/libcommon/include/common/config_common.h.in index 6cee84a5b32..41999bb5cde 100644 --- a/libs/libcommon/include/common/config_common.h.in +++ 
b/libs/libcommon/include/common/config_common.h.in @@ -3,6 +3,5 @@ // .h autogenerated by cmake ! #cmakedefine01 USE_JEMALLOC -#cmakedefine01 USE_REPLXX #cmakedefine01 UNBUNDLED #cmakedefine01 WITH_COVERAGE diff --git a/libs/libcommon/src/ReplxxLineReader.cpp b/libs/libcommon/src/ReplxxLineReader.cpp index 67eb81e127d..044ea05413d 100644 --- a/libs/libcommon/src/ReplxxLineReader.cpp +++ b/libs/libcommon/src/ReplxxLineReader.cpp @@ -15,15 +15,15 @@ void trim(String & s) } -ReplxxLineReader::ReplxxLineReader(const Suggest * suggest, const String & history_file_path_, char extender_, char delimiter_) +ReplxxLineReader::ReplxxLineReader(const Suggest & suggest, const String & history_file_path_, char extender_, char delimiter_) : LineReader(history_file_path_, extender_, delimiter_) { if (!history_file_path.empty()) rx.history_load(history_file_path); - auto callback = [suggest] (const String & context, size_t context_size) + auto callback = [&suggest] (const String & context, size_t context_size) { - auto range = suggest->getCompletions(context, context_size); + auto range = suggest.getCompletions(context, context_size); return replxx::Replxx::completions_t(range.first, range.second); }; diff --git a/utils/zookeeper-cli/CMakeLists.txt b/utils/zookeeper-cli/CMakeLists.txt index 7c14ed605fb..7e67f078586 100644 --- a/utils/zookeeper-cli/CMakeLists.txt +++ b/utils/zookeeper-cli/CMakeLists.txt @@ -1,3 +1,3 @@ add_executable(clickhouse-zookeeper-cli zookeeper-cli.cpp) -target_link_libraries(clickhouse-zookeeper-cli PRIVATE clickhouse_common_zookeeper ${Poco_Foundation_LIBRARY} ${LINE_EDITING_LIBS}) +target_link_libraries(clickhouse-zookeeper-cli PRIVATE clickhouse_common_zookeeper ${Poco_Foundation_LIBRARY}) INSTALL(TARGETS clickhouse-zookeeper-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse-utils) diff --git a/utils/zookeeper-cli/zookeeper-cli.cpp b/utils/zookeeper-cli/zookeeper-cli.cpp index c3434987bd6..ba20f462735 100644 --- 
a/utils/zookeeper-cli/zookeeper-cli.cpp +++ b/utils/zookeeper-cli/zookeeper-cli.cpp @@ -7,7 +7,7 @@ #include #include -#ifdef USE_REPLXX +#if USE_REPLXX # include #else # include @@ -74,7 +74,7 @@ int main(int argc, char ** argv) Logger::root().setLevel("trace"); zkutil::ZooKeeper zk(argv[1]); -#ifdef USE_REPLXX +#if USE_REPLXX ReplxxLineReader lr(nullptr, {}, '\\'); #else LineReader lr({}, '\\'); From f737f19d254096541cc58a9039d72e3d64c168a6 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Mon, 20 Jan 2020 19:23:37 +0300 Subject: [PATCH 28/89] Fix build --- utils/zookeeper-cli/zookeeper-cli.cpp | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/utils/zookeeper-cli/zookeeper-cli.cpp b/utils/zookeeper-cli/zookeeper-cli.cpp index ba20f462735..44140423a15 100644 --- a/utils/zookeeper-cli/zookeeper-cli.cpp +++ b/utils/zookeeper-cli/zookeeper-cli.cpp @@ -1,17 +1,13 @@ -#include +#include +#include +#include #include +#include +#include +#include + #include #include -#include -#include -#include -#include - -#if USE_REPLXX -# include -#else -# include -#endif void printStat(const Coordination::Stat & s) @@ -74,11 +70,7 @@ int main(int argc, char ** argv) Logger::root().setLevel("trace"); zkutil::ZooKeeper zk(argv[1]); -#if USE_REPLXX - ReplxxLineReader lr(nullptr, {}, '\\'); -#else LineReader lr({}, '\\'); -#endif do { From 79085bf6290093a2ade815e32993423d660010aa Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Wed, 22 Jan 2020 00:33:33 +0530 Subject: [PATCH 29/89] Set `X-ClickHouse-Format` HTTP response header to the format name --- dbms/programs/server/HTTPHandler.cpp | 5 ++++- dbms/src/Interpreters/executeQuery.cpp | 10 ++++----- dbms/src/Interpreters/executeQuery.h | 2 +- .../queries/0_stateless/00265_content_type.sh | 12 ----------- ...> 00265_content_type_and_format.reference} | 21 ++++++++++++------- .../00265_content_type_and_format.sh | 12 +++++++++++ .../queries/0_stateless/00501_http_head.sh | 2 +- 7 files 
changed, 37 insertions(+), 27 deletions(-) delete mode 100755 dbms/tests/queries/0_stateless/00265_content_type.sh rename dbms/tests/queries/0_stateless/{00265_content_type.reference => 00265_content_type_and_format.reference} (59%) create mode 100755 dbms/tests/queries/0_stateless/00265_content_type_and_format.sh diff --git a/dbms/programs/server/HTTPHandler.cpp b/dbms/programs/server/HTTPHandler.cpp index b2b3298693e..f5ee8e313ec 100644 --- a/dbms/programs/server/HTTPHandler.cpp +++ b/dbms/programs/server/HTTPHandler.cpp @@ -590,7 +590,10 @@ void HTTPHandler::processQuery( customizeContext(context); executeQuery(*in, *used_output.out_maybe_delayed_and_compressed, /* allow_into_outfile = */ false, context, - [&response] (const String & content_type) { response.setContentType(content_type); }, + [&response] (const String & content_type, const String & format) { + response.setContentType(content_type); + response.add("X-ClickHouse-Format", format); + }, [&response] (const String & current_query_id) { response.add("X-ClickHouse-Query-Id", current_query_id); }); if (used_output.hasDelayed()) diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp index 1f912738454..330ceb282a7 100644 --- a/dbms/src/Interpreters/executeQuery.cpp +++ b/dbms/src/Interpreters/executeQuery.cpp @@ -590,7 +590,7 @@ void executeQuery( WriteBuffer & ostr, bool allow_into_outfile, Context & context, - std::function set_content_type, + std::function set_content_type_and_format, std::function set_query_id) { PODArray parse_buf; @@ -680,8 +680,8 @@ void executeQuery( out->onProgress(progress); }); - if (set_content_type) - set_content_type(out->getContentType()); + if (set_content_type_and_format) + set_content_type_and_format(out->getContentType(), format_name); if (set_query_id) set_query_id(context.getClientInfo().current_query_id); @@ -742,8 +742,8 @@ void executeQuery( out->onProgress(progress); }); - if (set_content_type) - 
set_content_type(out->getContentType()); + if (set_content_type_and_format) + set_content_type_and_format(out->getContentType(), format_name); if (set_query_id) set_query_id(context.getClientInfo().current_query_id); diff --git a/dbms/src/Interpreters/executeQuery.h b/dbms/src/Interpreters/executeQuery.h index 3cff461f6d6..59b555b9f94 100644 --- a/dbms/src/Interpreters/executeQuery.h +++ b/dbms/src/Interpreters/executeQuery.h @@ -19,7 +19,7 @@ void executeQuery( WriteBuffer & ostr, /// Where to write query output to. bool allow_into_outfile, /// If true and the query contains INTO OUTFILE section, redirect output to that file. Context & context, /// DB, tables, data types, storage engines, functions, aggregate functions... - std::function set_content_type, /// If non-empty callback is passed, it will be called with the Content-Type of the result. + std::function set_content_type_and_format, /// If non-empty callback is passed, it will be called with the Content-Type and the Format of the result. std::function set_query_id /// If non-empty callback is passed, it will be called with the query id. ); diff --git a/dbms/tests/queries/0_stateless/00265_content_type.sh b/dbms/tests/queries/0_stateless/00265_content_type.sh deleted file mode 100755 index feddb46a6a4..00000000000 --- a/dbms/tests/queries/0_stateless/00265_content_type.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -. 
$CURDIR/../shell_config.sh - -${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&default_format=JSONCompact" --data-binary @- <<< "SELECT 1" 2>&1 | grep '< Content-Type'; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT JSON" 2>&1 | grep '< Content-Type'; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1" 2>&1 | grep '< Content-Type'; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT TabSeparated" 2>&1 | grep '< Content-Type'; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT Vertical" 2>&1 | grep '< Content-Type'; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT Native" 2>&1 | grep '< Content-Type'; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT RowBinary" 2>&1 | grep '< Content-Type'; diff --git a/dbms/tests/queries/0_stateless/00265_content_type.reference b/dbms/tests/queries/0_stateless/00265_content_type_and_format.reference similarity index 59% rename from dbms/tests/queries/0_stateless/00265_content_type.reference rename to dbms/tests/queries/0_stateless/00265_content_type_and_format.reference index 0693d1118da..dbe9ebc0f58 100644 --- a/dbms/tests/queries/0_stateless/00265_content_type.reference +++ b/dbms/tests/queries/0_stateless/00265_content_type_and_format.reference @@ -1,7 +1,14 @@ -< Content-Type: application/json; charset=UTF-8 -< Content-Type: application/json; charset=UTF-8 -< Content-Type: text/tab-separated-values; charset=UTF-8 -< Content-Type: text/tab-separated-values; charset=UTF-8 -< Content-Type: text/plain; charset=UTF-8 -< Content-Type: application/octet-stream -< Content-Type: application/octet-stream +< Content-Type: application/json; charset=UTF-8 +< X-ClickHouse-Format: JSONCompact +< Content-Type: application/json; charset=UTF-8 +< X-ClickHouse-Format: JSON +< Content-Type: text/tab-separated-values; charset=UTF-8 +< X-ClickHouse-Format: TabSeparated +< 
Content-Type: text/tab-separated-values; charset=UTF-8 +< X-ClickHouse-Format: TabSeparated +< Content-Type: text/plain; charset=UTF-8 +< X-ClickHouse-Format: Vertical +< Content-Type: application/octet-stream +< X-ClickHouse-Format: Native +< Content-Type: application/octet-stream +< X-ClickHouse-Format: RowBinary diff --git a/dbms/tests/queries/0_stateless/00265_content_type_and_format.sh b/dbms/tests/queries/0_stateless/00265_content_type_and_format.sh new file mode 100755 index 00000000000..4888788c603 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00265_content_type_and_format.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. $CURDIR/../shell_config.sh + +${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&default_format=JSONCompact" --data-binary @- <<< "SELECT 1" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT JSON" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT TabSeparated" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT Vertical" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT Native" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT RowBinary" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; diff --git a/dbms/tests/queries/0_stateless/00501_http_head.sh b/dbms/tests/queries/0_stateless/00501_http_head.sh index e235da3c192..df87743bb8d 100755 --- 
a/dbms/tests/queries/0_stateless/00501_http_head.sh +++ b/dbms/tests/queries/0_stateless/00501_http_head.sh @@ -4,7 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . $CURDIR/../shell_config.sh ( ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=SELECT%201"; - ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=select+*+from+system.numbers+limit+1000000" ) | grep -v "Date:" | grep -v "X-ClickHouse-Server-Display-Name:" | grep -v "X-ClickHouse-Query-Id:" + ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=select+*+from+system.numbers+limit+1000000" ) | grep -v "Date:" | grep -v "X-ClickHouse-Server-Display-Name:" | grep -v "X-ClickHouse-Query-Id:" | grep -v "X-ClickHouse-Format:" if [[ `${CLICKHOUSE_CURL} -sS -X POST -I "${CLICKHOUSE_URL}&query=SELECT+1" | grep -c '411 Length Required'` -ne 1 ]]; then echo FAIL From 4cffb62b4185e9a41f82a69047cafeb1caceaa85 Mon Sep 17 00:00:00 2001 From: millb Date: Tue, 21 Jan 2020 22:49:42 +0300 Subject: [PATCH 30/89] Created exception_code column in query_log table --- dbms/src/Interpreters/QueryLog.cpp | 7 +++++++ .../01070_exception_code_in_query_log_table.reference | 5 +++++ .../01070_exception_code_in_query_log_table.sql | 6 ++++++ 3 files changed, 18 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference create mode 100644 dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql diff --git a/dbms/src/Interpreters/QueryLog.cpp b/dbms/src/Interpreters/QueryLog.cpp index d9b86ea91ea..af1bd1f6d0e 100644 --- a/dbms/src/Interpreters/QueryLog.cpp +++ b/dbms/src/Interpreters/QueryLog.cpp @@ -50,6 +50,7 @@ Block QueryLogElement::createBlock() {std::make_shared(), "query"}, {std::make_shared(), "exception"}, + {std::make_shared(), "exception_code"}, {std::make_shared(), "stack_trace"}, {std::make_shared(), "is_initial_query"}, @@ -108,6 +109,12 @@ void QueryLogElement::appendToBlock(Block & block) const 
columns[i++]->insertData(query.data(), query.size()); columns[i++]->insertData(exception.data(), exception.size()); + + UInt16 exception_code = 0; + if (exception != "") + exception_code = parse(exception.data() + 6, exception.size() - 6); /// pass "Code: " + columns[i++]->insert(exception_code); + columns[i++]->insertData(stack_trace.data(), stack_trace.size()); appendClientInfo(client_info, columns, i); diff --git a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference new file mode 100644 index 00000000000..c1b0845ce0d --- /dev/null +++ b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference @@ -0,0 +1,5 @@ +0 +0 +0 +0 +60 diff --git a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql new file mode 100644 index 00000000000..123a72aeee0 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql @@ -0,0 +1,6 @@ +DROP TABLE IF EXISTS test_table; +SYSTEM FLUSH LOGS; +TRUNCATE TABLE system.query_log; +SELECT * FROM test_table; -- { serverError 60 } +SYSTEM FLUSH LOGS; +SELECT exception_code FROM system.query_log; From 2d599cb1cb651a5da97dbeb5ba2fed67d8d535aa Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Tue, 21 Jan 2020 23:28:35 +0300 Subject: [PATCH 31/89] Update HTTPHandler.cpp --- dbms/programs/server/HTTPHandler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/programs/server/HTTPHandler.cpp b/dbms/programs/server/HTTPHandler.cpp index f5ee8e313ec..a17490a2dc1 100644 --- a/dbms/programs/server/HTTPHandler.cpp +++ b/dbms/programs/server/HTTPHandler.cpp @@ -590,7 +590,8 @@ void HTTPHandler::processQuery( customizeContext(context); executeQuery(*in, *used_output.out_maybe_delayed_and_compressed, /* allow_into_outfile = */ false, context, - [&response] (const 
String & content_type, const String & format) { + [&response] (const String & content_type, const String & format) + { response.setContentType(content_type); response.add("X-ClickHouse-Format", format); }, From d414131c3cf52038c73579a0b77f169707c2aa27 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Wed, 22 Jan 2020 05:03:57 +0530 Subject: [PATCH 32/89] Update callback type and name --- dbms/programs/server/MySQLHandler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/programs/server/MySQLHandler.cpp b/dbms/programs/server/MySQLHandler.cpp index 64a78702bf0..7863d8ef159 100644 --- a/dbms/programs/server/MySQLHandler.cpp +++ b/dbms/programs/server/MySQLHandler.cpp @@ -282,7 +282,7 @@ void MySQLHandler::comQuery(ReadBuffer & payload) else { bool with_output = false; - std::function set_content_type = [&with_output](const String &) -> void { + std::function set_content_type_and_format = [&with_output](const String &, const String &) -> void { with_output = true; }; @@ -305,7 +305,7 @@ void MySQLHandler::comQuery(ReadBuffer & payload) ReadBufferFromString replacement(replacement_query); Context query_context = connection_context; - executeQuery(should_replace ? replacement : payload, *out, true, query_context, set_content_type, nullptr); + executeQuery(should_replace ? 
replacement : payload, *out, true, query_context, set_content_type_and_format, {}); if (!with_output) packet_sender->sendPacket(OK_Packet(0x00, client_capability_flags, 0, 0, 0), true); From e357c6feaf7b454e12bc8ebf8ecd0eebfa20968e Mon Sep 17 00:00:00 2001 From: Mikahil Nacharov Date: Wed, 22 Jan 2020 16:27:27 +0500 Subject: [PATCH 33/89] minor fixes in build-gcc script --- utils/ci/build-gcc-from-sources.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utils/ci/build-gcc-from-sources.sh b/utils/ci/build-gcc-from-sources.sh index 0734b22335a..06d9820a022 100755 --- a/utils/ci/build-gcc-from-sources.sh +++ b/utils/ci/build-gcc-from-sources.sh @@ -32,8 +32,8 @@ $SUDO make install popd popd -$SUDO ln -sf /usr/local/bin/gcc /usr/local/bin/gcc-${GCC_GCC_SOURCES_VERSION_SHORT} -$SUDO ln -sf /usr/local/bin/g++ /usr/local/bin/g++-${GCC_GCC_SOURCES_VERSION_SHORT} +$SUDO ln -sf /usr/local/bin/gcc /usr/local/bin/gcc-${GCC_VERSION_SHORT} +$SUDO ln -sf /usr/local/bin/g++ /usr/local/bin/g++-${GCC_VERSION_SHORT} $SUDO ln -sf /usr/local/bin/gcc /usr/local/bin/cc $SUDO ln -sf /usr/local/bin/g++ /usr/local/bin/c++ @@ -43,5 +43,5 @@ $SUDO ldconfig hash gcc g++ gcc --version -export CC=gcc -export CXX=g++ +export CC=gcc-${GCC_VERSION_SHORT} +export CXX=g++-${GCC_VERSION_SHORT} From 6cd6b4d3b2982e857a07c663465927716e56dbff Mon Sep 17 00:00:00 2001 From: millb Date: Wed, 22 Jan 2020 15:29:30 +0300 Subject: [PATCH 34/89] exception_code is changed --- dbms/src/Interpreters/QueryLog.cpp | 9 ++------- dbms/src/Interpreters/QueryLog.h | 1 + dbms/src/Interpreters/executeQuery.cpp | 2 ++ 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/dbms/src/Interpreters/QueryLog.cpp b/dbms/src/Interpreters/QueryLog.cpp index af1bd1f6d0e..b57b54ac710 100644 --- a/dbms/src/Interpreters/QueryLog.cpp +++ b/dbms/src/Interpreters/QueryLog.cpp @@ -49,8 +49,8 @@ Block QueryLogElement::createBlock() {std::make_shared(), "memory_usage"}, {std::make_shared(), "query"}, + 
{std::make_shared(), "exception_code"}, {std::make_shared(), "exception"}, - {std::make_shared(), "exception_code"}, {std::make_shared(), "stack_trace"}, {std::make_shared(), "is_initial_query"}, @@ -108,13 +108,8 @@ void QueryLogElement::appendToBlock(Block & block) const columns[i++]->insert(memory_usage); columns[i++]->insertData(query.data(), query.size()); - columns[i++]->insertData(exception.data(), exception.size()); - - UInt16 exception_code = 0; - if (exception != "") - exception_code = parse(exception.data() + 6, exception.size() - 6); /// pass "Code: " columns[i++]->insert(exception_code); - + columns[i++]->insertData(exception.data(), exception.size()); columns[i++]->insertData(stack_trace.data(), stack_trace.size()); appendClientInfo(client_info, columns, i); diff --git a/dbms/src/Interpreters/QueryLog.h b/dbms/src/Interpreters/QueryLog.h index 0bee61df394..a519ddb896d 100644 --- a/dbms/src/Interpreters/QueryLog.h +++ b/dbms/src/Interpreters/QueryLog.h @@ -54,6 +54,7 @@ struct QueryLogElement String query; + Int32 exception_code; // because ErrorCodes are int String exception; String stack_trace; diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp index 1f912738454..a71450580b7 100644 --- a/dbms/src/Interpreters/executeQuery.cpp +++ b/dbms/src/Interpreters/executeQuery.cpp @@ -163,6 +163,7 @@ static void onExceptionBeforeStart(const String & query_for_logging, Context & c elem.query_start_time = current_time; elem.query = query_for_logging; + elem.exception_code = getCurrentExceptionCode(); elem.exception = getCurrentExceptionMessage(false); elem.client_info = context.getClientInfo(); @@ -496,6 +497,7 @@ static std::tuple executeQueryImpl( elem.event_time = time(nullptr); elem.query_duration_ms = 1000 * (elem.event_time - elem.query_start_time); + elem.exception_code = getCurrentExceptionCode(); elem.exception = getCurrentExceptionMessage(false); QueryStatus * process_list_elem = context.getProcessListElement(); 
From f3ff57e21ec070bc0bc67a3eff7e85b117c3e78f Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Tue, 21 Jan 2020 19:35:42 +0300 Subject: [PATCH 35/89] Fix build --- contrib/replxx-cmake/CMakeLists.txt | 2 +- docker/packager/packager | 2 +- libs/libcommon/src/LineReader.cpp | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/contrib/replxx-cmake/CMakeLists.txt b/contrib/replxx-cmake/CMakeLists.txt index 5a4ee3abf0f..1240eb56b39 100644 --- a/contrib/replxx-cmake/CMakeLists.txt +++ b/contrib/replxx-cmake/CMakeLists.txt @@ -51,7 +51,7 @@ if (ENABLE_REPLXX) message (STATUS "Using replxx") else () add_library(replxx INTERFACE) - target_compile_definitions(replxx PUBLIC USE_REPLXX=0) + target_compile_definitions(replxx INTERFACE USE_REPLXX=0) message (STATUS "Not using replxx (Beware! Runtime fallback to readline is possible!)") endif () diff --git a/docker/packager/packager b/docker/packager/packager index 62767cae8f0..a31a387d502 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -177,7 +177,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ if unbundled: # TODO: fix build with ENABLE_RDKAFKA - cmake_flags.append('-DUNBUNDLED=1 -DENABLE_MYSQL=0 -DENABLE_POCO_ODBC=0 -DENABLE_ODBC=0 -DENABLE_READLINE=0 -DENABLE_RDKAFKA=0') + cmake_flags.append('-DUNBUNDLED=1 -DENABLE_MYSQL=0 -DENABLE_POCO_ODBC=0 -DENABLE_ODBC=0 -DENABLE_REPLXX=0 -DENABLE_RDKAFKA=0') if split_binary: cmake_flags.append('-DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1 -DCLICKHOUSE_SPLIT_BINARY=1') diff --git a/libs/libcommon/src/LineReader.cpp b/libs/libcommon/src/LineReader.cpp index 2f3d986ad30..4a3a737fe7c 100644 --- a/libs/libcommon/src/LineReader.cpp +++ b/libs/libcommon/src/LineReader.cpp @@ -6,13 +6,15 @@ #include #include +#ifdef OS_LINUX /// We can detect if code is linked with one or another readline variants or open the library dynamically. 
-#include +# include extern "C" { char * readline(const char *) __attribute__((__weak__)); char * (*readline_ptr)(const char *) = readline; } +#endif namespace { @@ -112,6 +114,7 @@ LineReader::InputStatus LineReader::readOneLine(const String & prompt) { input.clear(); +#ifdef OS_LINUX if (!readline_ptr) { for (auto name : {"libreadline.so", "libreadline.so.0", "libeditline.so", "libeditline.so.0"}) @@ -137,6 +140,7 @@ LineReader::InputStatus LineReader::readOneLine(const String & prompt) input = line_read; } else +#endif { std::cout << prompt; std::getline(std::cin, input); From 6fc7a827f689793cdb8185239fdc6b6c358cf5b0 Mon Sep 17 00:00:00 2001 From: millb Date: Wed, 22 Jan 2020 16:52:26 +0300 Subject: [PATCH 36/89] Fixed bug with empty exception --- dbms/src/Interpreters/QueryLog.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Interpreters/QueryLog.h b/dbms/src/Interpreters/QueryLog.h index a519ddb896d..c87188b1365 100644 --- a/dbms/src/Interpreters/QueryLog.h +++ b/dbms/src/Interpreters/QueryLog.h @@ -38,7 +38,7 @@ struct QueryLogElement time_t query_start_time{}; UInt64 query_duration_ms{}; - /// The data fetched from DB to execute the query + /// The data fetched from 1457488640DB to execute the query UInt64 read_rows{}; UInt64 read_bytes{}; @@ -54,7 +54,7 @@ struct QueryLogElement String query; - Int32 exception_code; // because ErrorCodes are int + Int32 exception_code{}; // because ErrorCodes are int String exception; String stack_trace; From d68f8d1f8e7d72db40684a2aca5f2fc7fb5e55c5 Mon Sep 17 00:00:00 2001 From: Mikhail Korotov <55493615+millb@users.noreply.github.com> Date: Wed, 22 Jan 2020 16:58:36 +0300 Subject: [PATCH 37/89] Update QueryLog.h --- dbms/src/Interpreters/QueryLog.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Interpreters/QueryLog.h b/dbms/src/Interpreters/QueryLog.h index c87188b1365..f14691df64e 100644 --- a/dbms/src/Interpreters/QueryLog.h +++ 
b/dbms/src/Interpreters/QueryLog.h @@ -38,7 +38,7 @@ struct QueryLogElement time_t query_start_time{}; UInt64 query_duration_ms{}; - /// The data fetched from 1457488640DB to execute the query + /// The data fetched from DB to execute the query UInt64 read_rows{}; UInt64 read_bytes{}; From eadb9022c80e21dee2f1dc3ac2fcc484279d75e2 Mon Sep 17 00:00:00 2001 From: millb Date: Wed, 22 Jan 2020 18:32:38 +0300 Subject: [PATCH 38/89] Tests are changed --- .../01070_exception_code_in_query_log_table.reference | 6 ++---- .../01070_exception_code_in_query_log_table.sql | 11 ++++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference index c1b0845ce0d..ffe29f4e599 100644 --- a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference +++ b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference @@ -1,5 +1,3 @@ -0 -0 -0 -0 60 +0 +0 diff --git a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql index 123a72aeee0..f90bf6107d1 100644 --- a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql +++ b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql @@ -1,6 +1,7 @@ -DROP TABLE IF EXISTS test_table; +DROP TABLE IF EXISTS test_table_for_01070_exception_code_in_query_log_table; +SELECT * FROM test_table_for_01070_exception_code_in_query_log_table; -- { serverError 60 } +CREATE TABLE test_table_for_01070_exception_code_in_query_log_table (value UInt64) ENGINE=Memory(); +SELECT * FROM test_table_for_01070_exception_code_in_query_log_table; SYSTEM FLUSH LOGS; -TRUNCATE TABLE system.query_log; -SELECT * FROM test_table; -- { serverError 60 } -SYSTEM FLUSH LOGS; -SELECT exception_code FROM system.query_log; +SELECT 
exception_code FROM system.query_log WHERE query='SELECT * FROM test_table_for_01070_exception_code_in_query_log_table'; +DROP TABLE IF EXISTS test_table_for_01070_exception_code_in_query_log_table; From 8f241eab492ee99166dbeee0fb2387207df134e8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 22 Jan 2020 19:11:34 +0300 Subject: [PATCH 39/89] Add image for woboq codebrowser report --- docker/test/codebrowser/Dockerfile | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 docker/test/codebrowser/Dockerfile diff --git a/docker/test/codebrowser/Dockerfile b/docker/test/codebrowser/Dockerfile new file mode 100644 index 00000000000..0c612dff0bd --- /dev/null +++ b/docker/test/codebrowser/Dockerfile @@ -0,0 +1,45 @@ +# docker build --network=host -t yandex/clickhouse-codebrowser . +# docker run --volume=path_to_repo:/repo_folder --volume=path_to_result:/test_output yandex/clickhouse-codebrowser +FROM ubuntu:18.04 + +RUN apt-get --allow-unauthenticated update -y \ + && env DEBIAN_FRONTEND=noninteractive \ + apt-get --allow-unauthenticated install --yes --no-install-recommends \ + bash \ + sudo \ + wget \ + software-properties-common \ + ca-certificates \ + apt-transport-https \ + build-essential \ + gpg-agent \ + git + +RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | sudo apt-key add - +RUN sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' +RUN sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" >> /etc/apt/sources.list + +RUN sudo apt-get --yes --allow-unauthenticated update +# To build woboq +RUN sudo apt-get --yes --allow-unauthenticated install cmake clang-8 libllvm8 libclang-8-dev + +# repo versions doesn't work correctly with C++17 +RUN git clone https://github.com/woboq/woboq_codebrowser.git +RUN cd woboq_codebrowser && cmake . 
-DCMAKE_BUILD_TYPE=Release && make -j + +ENV CODEGEN=/woboq_codebrowser/generator/codebrowser_generator +ENV CODEINDEX=/woboq_codebrowser/indexgenerator/codebrowser_indexgenerator +ENV STATIC_DATA=/woboq_codebrowser/data + +ENV SOURCE_DIRECTORY=/repo_folder +ENV BUILD_DIRECTORY=/build +ENV HTML_RESULT_DIRECTORY=$BUILD_DIRECTORY/html_report +ENV SHA=nosha + +CMD mkdir -p $BUILD_DIRECTORY && cd $BUILD_DIRECTORY && \ + cmake $SOURCE_DIRECTORY -DCMAKE_CXX_COMPILER=/usr/bin/clang\+\+-8 -DCMAKE_C_COMPILER=/usr/bin/clang-8 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON && \ + mkdir -p $HTML_RESULT_DIRECTORY && \ + $CODEGEN -b $BUILD_DIRECTORY -a -o $HTML_RESULT_DIRECTORY -p ClickHouse:$SOURCE_DIRECTORY:$SHA && \ + $CODEINDEX $HTML_RESULT_DIRECTORY && \ + cp -r $STATIC_DATA $HTML_RESULT_DIRECTORY/ &&\ + mv $HTML_RESULT_DIRECTORY /test_output From f0b7422dcb6ca046fe9d3c3a27312c7c507fe627 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 22 Jan 2020 19:56:49 +0300 Subject: [PATCH 40/89] Loop queries for math perftest. 
--- dbms/tests/performance/math.xml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/dbms/tests/performance/math.xml b/dbms/tests/performance/math.xml index 5f4f302a0e8..f4d31713a08 100644 --- a/dbms/tests/performance/math.xml +++ b/dbms/tests/performance/math.xml @@ -1,14 +1,18 @@ - once + + loop + + 5 + 10000 + - 1000 - 10000 + 50 + 60000 - func @@ -37,7 +41,7 @@ - SELECT count() FROM system.numbers WHERE NOT ignore({func}(toFloat64(number))) - SELECT count() FROM system.numbers WHERE NOT ignore({func}(toFloat32(number))) - SELECT count() FROM system.numbers WHERE NOT ignore({func}(number)) + SELECT count() FROM numbers(100000000) WHERE NOT ignore({func}(toFloat64(number))) + SELECT count() FROM numbers(100000000) WHERE NOT ignore({func}(toFloat32(number))) + SELECT count() FROM numbers(100000000) WHERE NOT ignore({func}(number)) From b6fe187e591ca3e33df88aec3940d5b351285357 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Wed, 22 Jan 2020 22:52:04 +0530 Subject: [PATCH 41/89] Fix test/check failures --- dbms/programs/server/MySQLHandler.cpp | 3 ++- .../0_stateless/00265_content_type_and_format.sh | 14 +++++++------- dbms/tests/queries/0_stateless/00501_http_head.sh | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dbms/programs/server/MySQLHandler.cpp b/dbms/programs/server/MySQLHandler.cpp index 7863d8ef159..9dd107f9d5f 100644 --- a/dbms/programs/server/MySQLHandler.cpp +++ b/dbms/programs/server/MySQLHandler.cpp @@ -282,7 +282,8 @@ void MySQLHandler::comQuery(ReadBuffer & payload) else { bool with_output = false; - std::function set_content_type_and_format = [&with_output](const String &, const String &) -> void { + std::function set_content_type_and_format = [&with_output](const String &, const String &) -> void + { with_output = true; }; diff --git a/dbms/tests/queries/0_stateless/00265_content_type_and_format.sh b/dbms/tests/queries/0_stateless/00265_content_type_and_format.sh index 
4888788c603..2a36a17c6a1 100755 --- a/dbms/tests/queries/0_stateless/00265_content_type_and_format.sh +++ b/dbms/tests/queries/0_stateless/00265_content_type_and_format.sh @@ -3,10 +3,10 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . $CURDIR/../shell_config.sh -${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&default_format=JSONCompact" --data-binary @- <<< "SELECT 1" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT JSON" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT TabSeparated" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT Vertical" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT Native" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; -${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT RowBinary" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sort; +${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&default_format=JSONCompact" --data-binary @- <<< "SELECT 1" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sed 's/\r$//' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT JSON" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sed 's/\r$//' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sed 's/\r$//' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT TabSeparated" 2>&1 | grep -e 
'< Content-Type' -e '< X-ClickHouse-Format' | sed 's/\r$//' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT Vertical" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sed 's/\r$//' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT Native" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sed 's/\r$//' | sort; +${CLICKHOUSE_CURL} -vsS ${CLICKHOUSE_URL} --data-binary @- <<< "SELECT 1 FORMAT RowBinary" 2>&1 | grep -e '< Content-Type' -e '< X-ClickHouse-Format' | sed 's/\r$//' | sort; diff --git a/dbms/tests/queries/0_stateless/00501_http_head.sh b/dbms/tests/queries/0_stateless/00501_http_head.sh index df87743bb8d..7251fc2cf21 100755 --- a/dbms/tests/queries/0_stateless/00501_http_head.sh +++ b/dbms/tests/queries/0_stateless/00501_http_head.sh @@ -4,7 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . $CURDIR/../shell_config.sh ( ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=SELECT%201"; - ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=select+*+from+system.numbers+limit+1000000" ) | grep -v "Date:" | grep -v "X-ClickHouse-Server-Display-Name:" | grep -v "X-ClickHouse-Query-Id:" | grep -v "X-ClickHouse-Format:" + ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=select+*+from+system.numbers+limit+1000000" ) | grep -v "Date:" | grep -v "X-ClickHouse-Server-Display-Name:" | grep -v "X-ClickHouse-Query-Id:" | grep -v "X-ClickHouse-Format:" if [[ `${CLICKHOUSE_CURL} -sS -X POST -I "${CLICKHOUSE_URL}&query=SELECT+1" | grep -c '411 Length Required'` -ne 1 ]]; then echo FAIL From 55be790199c0cd4ded42761341a8d2c5feff84e5 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 22 Jan 2020 21:26:16 +0300 Subject: [PATCH 42/89] Do not crash if the row template file is empty. 
--- .../Formats/ParsedTemplateFormatString.cpp | 25 +++++++------------ .../01070_template_empty_file.reference | 0 .../0_stateless/01070_template_empty_file.sql | 2 ++ 3 files changed, 11 insertions(+), 16 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01070_template_empty_file.reference create mode 100644 dbms/tests/queries/0_stateless/01070_template_empty_file.sql diff --git a/dbms/src/Formats/ParsedTemplateFormatString.cpp b/dbms/src/Formats/ParsedTemplateFormatString.cpp index 981d43089a2..817f2c205f0 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.cpp +++ b/dbms/src/Formats/ParsedTemplateFormatString.cpp @@ -16,20 +16,10 @@ namespace ErrorCodes ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name) { - try - { - ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); - String format_string; - readStringUntilEOF(format_string, schema_file); - parse(format_string, idx_by_name); - } - catch (DB::Exception & e) - { - if (e.code() != ErrorCodes::INVALID_TEMPLATE_FORMAT) - throwInvalidFormat(e.message(), columnsCount()); - else - throw; - } + ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); + String format_string; + readStringUntilEOF(format_string, schema_file); + parse(format_string, idx_by_name); } @@ -193,8 +183,11 @@ const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const cha String ParsedTemplateFormatString::dump() const { WriteBufferFromOwnString res; - res << "Delimiter " << 0 << ": "; - verbosePrintString(delimiters.front().c_str(), delimiters.front().c_str() + delimiters.front().size(), res); + res << "\nDelimiter " << 0 << ": "; + if (delimiters.size() <= 1) + res << ""; + else + verbosePrintString(delimiters[0].c_str(), delimiters[0].c_str() + delimiters[0].size(), res); size_t num_columns = std::max(formats.size(), format_idx_to_column_idx.size()); for (size_t i = 0; i < num_columns; ++i) diff --git 
a/dbms/tests/queries/0_stateless/01070_template_empty_file.reference b/dbms/tests/queries/0_stateless/01070_template_empty_file.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/01070_template_empty_file.sql b/dbms/tests/queries/0_stateless/01070_template_empty_file.sql new file mode 100644 index 00000000000..46a8f38f80b --- /dev/null +++ b/dbms/tests/queries/0_stateless/01070_template_empty_file.sql @@ -0,0 +1,2 @@ +select 1 format Template settings format_template_row='01070_nonexistent_file.txt'; -- { clientError 107 } +select 1 format Template settings format_template_row='/dev/null'; -- { clientError 474 } From c00636bfa78e1bc665a8e001c965c17ea2e29d3e Mon Sep 17 00:00:00 2001 From: tavplubix Date: Wed, 22 Jan 2020 22:23:59 +0300 Subject: [PATCH 43/89] Update ParsedTemplateFormatString.cpp --- dbms/src/Formats/ParsedTemplateFormatString.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dbms/src/Formats/ParsedTemplateFormatString.cpp b/dbms/src/Formats/ParsedTemplateFormatString.cpp index 817f2c205f0..1dc8dd25f6f 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.cpp +++ b/dbms/src/Formats/ParsedTemplateFormatString.cpp @@ -19,7 +19,17 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); String format_string; readStringUntilEOF(format_string, schema_file); - parse(format_string, idx_by_name); + try + { + parse(format_string, idx_by_name); + } + catch (DB::Exception & e) + { + if (e.code() != ErrorCodes::INVALID_TEMPLATE_FORMAT) + throwInvalidFormat(e.message(), columnsCount()); + else + throw; + } } From 8cfe9a4d66208f0f8a12352df15f474675989348 Mon Sep 17 00:00:00 2001 From: tavplubix Date: Wed, 22 Jan 2020 23:09:23 +0300 Subject: [PATCH 44/89] Update ParsedTemplateFormatString.cpp --- dbms/src/Formats/ParsedTemplateFormatString.cpp | 5 +---- 1 file changed, 1 insertion(+), 
4 deletions(-) diff --git a/dbms/src/Formats/ParsedTemplateFormatString.cpp b/dbms/src/Formats/ParsedTemplateFormatString.cpp index 1dc8dd25f6f..af6fc39c8fd 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.cpp +++ b/dbms/src/Formats/ParsedTemplateFormatString.cpp @@ -194,10 +194,7 @@ String ParsedTemplateFormatString::dump() const { WriteBufferFromOwnString res; res << "\nDelimiter " << 0 << ": "; - if (delimiters.size() <= 1) - res << ""; - else - verbosePrintString(delimiters[0].c_str(), delimiters[0].c_str() + delimiters[0].size(), res); + verbosePrintString(delimiters.front().c_str(), delimiters.front().c_str() + delimiters.front().size(), res); size_t num_columns = std::max(formats.size(), format_idx_to_column_idx.size()); for (size_t i = 0; i < num_columns; ++i) From a02b59f3009b3a65ae67450a2da20c4d68914cac Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 00:10:33 +0300 Subject: [PATCH 45/89] Update roadmap --- docs/ru/extended_roadmap.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/ru/extended_roadmap.md b/docs/ru/extended_roadmap.md index 8448822282d..145779cfccb 100644 --- a/docs/ru/extended_roadmap.md +++ b/docs/ru/extended_roadmap.md @@ -145,6 +145,8 @@ Q2. Upd. На данный момент исправляются проблемы с регрессиями производительности в отдельных случаях. Кажется, что все проблемы исправлены. Включение по-умолчанию в Q1, но остаётся вторая часть задачи по корректному выделению async части. +Upd. Включили по-умолчанию. Удаление старого кода не раньше, чем после первого релиза, в котором это включено по-умолчанию и всё ещё можно выключить обратно. + ### 2.2. Инфраструктура событий/метрик/ограничений/квот/трассировки. В очереди. https://gist.github.com/alexey-milovidov/d62d73222d83b9319dc519cbb13aeff6 @@ -214,10 +216,12 @@ Upd. На данный момент исправляются проблемы с Требует 3.1. -### 3.3. Исправить катастрофически отвратительно неприемлемый поиск по документации. +### + 3.3. 
Исправить катастрофически отвратительно неприемлемый поиск по документации. [Иван Блинков](https://github.com/blinkov/) - очень хороший человек. Сам сайт документации основан на технологиях, не удовлетворяющих требованиям задачи, и эти технологии трудно исправить. Задачу будет делать первый встретившийся нам frontend разработчик, которого мы сможем заставить это сделать. +Upd. Иван Блинков сделал эту задачу путём замены треш-технологий на нормальные. + ### 3.4. + Добавить японский язык в документацию. Эту задачу сделает [Иван Блинков](https://github.com/blinkov/), до конца декабря 2019. Сделано. From ef93eb47a6360e2a068dc89dd3e5855c83d811b7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 02:50:57 +0300 Subject: [PATCH 46/89] Removed old garbage --- dbms/src/Core/Types.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/dbms/src/Core/Types.h b/dbms/src/Core/Types.h index ea80ab7d427..4f350ba00d5 100644 --- a/dbms/src/Core/Types.h +++ b/dbms/src/Core/Types.h @@ -31,7 +31,6 @@ enum class TypeIndex Float64, Date, DateTime, - DateTime32 = DateTime, DateTime64, String, FixedString, @@ -158,8 +157,6 @@ using Decimal32 = Decimal; using Decimal64 = Decimal; using Decimal128 = Decimal; -// TODO (nemkov): consider making a strong typedef -//using DateTime32 = time_t; using DateTime64 = Decimal64; template <> struct TypeName { static const char * get() { return "Decimal32"; } }; From b213f08f1c81dfafb79c125293448cbb41278e93 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 03:04:50 +0300 Subject: [PATCH 47/89] Added type conversions in AvroInputFormat --- .../Formats/Impl/AvroRowInputFormat.cpp | 115 ++++++++++-------- 1 file changed, 63 insertions(+), 52 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 649ed777c4f..042775fcbb2 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ 
b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -105,6 +105,51 @@ static void deserializeNoop(IColumn &, avro::Decoder &) { } +/// Insert value with conversion to the column of target type. +template +static void insertNumber(IColumn & column, WhichDataType type, T value) +{ + switch (type.idx) + { + case TypeIndex::UInt8: + assert_cast(column).insertValue(value); + break; + case TypeIndex::Date: [[fallthrough]]; + case TypeIndex::UInt16: + assert_cast(column).insertValue(value); + break; + case TypeIndex::DateTime: [[fallthrough]]; + case TypeIndex::UInt32: + assert_cast(column).insertValue(value); + break; + case TypeIndex::DateTime64: [[fallthrough]]; + case TypeIndex::UInt64: + assert_cast(column).insertValue(value); + break; + case TypeIndex::Int8, + assert_cast(column).insertValue(value); + break; + case TypeIndex::Int16, + assert_cast(column).insertValue(value); + break; + case TypeIndex::Int32, + assert_cast(column).insertValue(value); + break; + case TypeIndex::Int64, + assert_cast(column).insertValue(value); + break; + case TypeIndex::Float32, + assert_cast(column).insertValue(value); + break; + case TypeIndex::Float64, + assert_cast(column).insertValue(value); + break; + default: + throw Exception("Type " + type->getName() + " is not compatible with Avro", ErrorCodes::ILLEGAL_COLUMN); + } +} + + AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::NodePtr root_node, DataTypePtr target_type) { auto logical_type = root_node->logicalType().type(); @@ -123,68 +168,34 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node } break; case avro::AVRO_INT: - if (target.isInt32()) + return [target](IColumn & column, avro::Decoder & decoder) { - return [](IColumn & column, avro::Decoder & decoder) - { - assert_cast(column).insertValue(decoder.decodeInt()); - }; - } - if (target.isDate() && logical_type == avro::LogicalType::DATE) - { - return [](IColumn & column, avro::Decoder & decoder) - { - 
assert_cast(column).insertValue(decoder.decodeInt()); - }; - } + insertValue(column, target, decoder.decodeInt()); + }; break; case avro::AVRO_LONG: - if (target.isInt64()) + return [target](IColumn & column, avro::Decoder & decoder) { - return [](IColumn & column, avro::Decoder & decoder) - { - assert_cast(column).insertValue(decoder.decodeLong()); - }; - } - if (target.isDateTime64()) - { - auto date_time_scale = assert_cast(*target_type).getScale(); - if ((logical_type == avro::LogicalType::TIMESTAMP_MILLIS && date_time_scale == 3) - || (logical_type == avro::LogicalType::TIMESTAMP_MICROS && date_time_scale == 6)) - { - return [](IColumn & column, avro::Decoder & decoder) - { - assert_cast(column).insertValue(decoder.decodeLong()); - }; - } - } + insertValue(column, target, decoder.decodeLong()); + }; break; case avro::AVRO_FLOAT: - if (target.isFloat32()) + return [target](IColumn & column, avro::Decoder & decoder) { - return [](IColumn & column, avro::Decoder & decoder) - { - assert_cast(column).insertValue(decoder.decodeFloat()); - }; - } + insertValue(column, target, decoder.decodeFloat()); + }; break; case avro::AVRO_DOUBLE: - if (target.isFloat64()) + return [target](IColumn & column, avro::Decoder & decoder) { - return [](IColumn & column, avro::Decoder & decoder) - { - assert_cast(column).insertValue(decoder.decodeDouble()); - }; - } + insertValue(column, target, decoder.decodeDouble()); + }; break; case avro::AVRO_BOOL: - if (target.isUInt8()) + return [target](IColumn & column, avro::Decoder & decoder) { - return [](IColumn & column, avro::Decoder & decoder) - { - assert_cast(column).insertValue(decoder.decodeBool()); - }; - } + insertValue(column, target, decoder.decodeBool()); + }; break; case avro::AVRO_ARRAY: if (target.isArray()) @@ -304,14 +315,14 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node } break; } - case avro::AVRO_MAP: - case avro::AVRO_RECORD: + case avro::AVRO_MAP: [[fallthrough]]; + case 
avro::AVRO_RECORD: [[fallthrough]]; default: break; } throw Exception( - "Type " + target_type->getName() + " is not compatible" + " with Avro " + avro::ValidSchema(root_node).toJson(false), + "Type " + target_type->getName() + " is not compatible with Avro " + avro::ValidSchema(root_node).toJson(false), ErrorCodes::ILLEGAL_COLUMN); } From 0c18bcf4158da1ca31de55551971f58fb9521fd9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 03:41:37 +0300 Subject: [PATCH 48/89] Addition to prev. revision --- .../Formats/Impl/AvroRowInputFormat.cpp | 38 ++++++++----------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 042775fcbb2..ba3eb3d32aa 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -76,7 +76,7 @@ class InputStreamReadBufferAdapter : public avro::InputStream public: InputStreamReadBufferAdapter(ReadBuffer & in_) : in(in_) {} - bool next(const uint8_t ** data, size_t * len) + bool next(const uint8_t ** data, size_t * len) override { if (in.eof()) { @@ -91,11 +91,11 @@ public: return true; } - void backup(size_t len) { in.position() -= len; } + void backup(size_t len) override { in.position() -= len; } - void skip(size_t len) { in.tryIgnore(len); } + void skip(size_t len) override { in.tryIgnore(len); } - size_t byteCount() const { return in.count(); } + size_t byteCount() const override { return in.count(); } private: ReadBuffer & in; @@ -126,33 +126,32 @@ static void insertNumber(IColumn & column, WhichDataType type, T value) case TypeIndex::UInt64: assert_cast(column).insertValue(value); break; - case TypeIndex::Int8, + case TypeIndex::Int8: assert_cast(column).insertValue(value); break; - case TypeIndex::Int16, + case TypeIndex::Int16: assert_cast(column).insertValue(value); break; - case TypeIndex::Int32, + case 
TypeIndex::Int32: assert_cast(column).insertValue(value); break; - case TypeIndex::Int64, + case TypeIndex::Int64: assert_cast(column).insertValue(value); break; - case TypeIndex::Float32, + case TypeIndex::Float32: assert_cast(column).insertValue(value); break; - case TypeIndex::Float64, + case TypeIndex::Float64: assert_cast(column).insertValue(value); break; default: - throw Exception("Type " + type->getName() + " is not compatible with Avro", ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Type is not compatible with Avro", ErrorCodes::ILLEGAL_COLUMN); } } AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::NodePtr root_node, DataTypePtr target_type) { - auto logical_type = root_node->logicalType().type(); WhichDataType target(target_type); switch (root_node->type()) { @@ -170,33 +169,28 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node case avro::AVRO_INT: return [target](IColumn & column, avro::Decoder & decoder) { - insertValue(column, target, decoder.decodeInt()); + insertNumber(column, target, decoder.decodeInt()); }; - break; case avro::AVRO_LONG: return [target](IColumn & column, avro::Decoder & decoder) { - insertValue(column, target, decoder.decodeLong()); + insertNumber(column, target, decoder.decodeLong()); }; - break; case avro::AVRO_FLOAT: return [target](IColumn & column, avro::Decoder & decoder) { - insertValue(column, target, decoder.decodeFloat()); + insertNumber(column, target, decoder.decodeFloat()); }; - break; case avro::AVRO_DOUBLE: return [target](IColumn & column, avro::Decoder & decoder) { - insertValue(column, target, decoder.decodeDouble()); + insertNumber(column, target, decoder.decodeDouble()); }; - break; case avro::AVRO_BOOL: return [target](IColumn & column, avro::Decoder & decoder) { - insertValue(column, target, decoder.decodeBool()); + insertNumber(column, target, decoder.decodeBool()); }; - break; case avro::AVRO_ARRAY: if (target.isArray()) { From 
39565eb0b00df194491cb51473902e832592e74a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 04:03:32 +0300 Subject: [PATCH 49/89] Addition to prev. revision --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index ba3eb3d32aa..a792b6c871f 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -399,7 +399,7 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) }; } default: - throw Exception("Unsupported Avro type", ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Unsupported Avro type " + avro::ValidSchema(root_node).toJson(false), ErrorCodes::ILLEGAL_COLUMN); } } From fde33ddc4790a3819bd482f406354b2a57cdbc4d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 04:06:53 +0300 Subject: [PATCH 50/89] Addition to prev. revision --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index a792b6c871f..ed30fb21e58 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -399,7 +399,7 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) }; } default: - throw Exception("Unsupported Avro type " + avro::ValidSchema(root_node).toJson(false), ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Unsupported Avro type " + root_node->name().fullname(), ErrorCodes::ILLEGAL_COLUMN); } } From 9f0230d4ba9cc9bfe01355fbfd9b84ead8d4dcb2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 04:09:17 +0300 Subject: [PATCH 51/89] Addition to prev. 
revision --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index ed30fb21e58..b702e42c4f0 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -399,7 +399,7 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) }; } default: - throw Exception("Unsupported Avro type " + root_node->name().fullname(), ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Unsupported Avro type " + root_node->name().fullname() + " (" + toString(int(root_node->type())) + ")", ErrorCodes::ILLEGAL_COLUMN); } } From a75ce4477bc8e60c848947e2d78140650a561ba8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 05:01:58 +0300 Subject: [PATCH 52/89] Fixed error in Avro format --- .../Formats/Impl/AvroRowOutputFormat.cpp | 23 +++++++++++-------- .../Formats/Impl/AvroRowOutputFormat.h | 4 +++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index b0375c7e2ae..c32f46552b1 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -78,8 +78,10 @@ private: }; -AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeFn(DataTypePtr data_type) +AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeFn(DataTypePtr data_type, size_t & type_name_increment) { + ++type_name_increment; + switch (data_type->getTypeId()) { case TypeIndex::UInt8: @@ -169,7 +171,8 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF }}; case TypeIndex::FixedString: { - auto schema = avro::FixedSchema(data_type->getSizeOfValueInMemory(), 
"fixed"); + auto size = data_type->getSizeOfValueInMemory(); + auto schema = avro::FixedSchema(size, "fixed" + toString(size)); return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { const StringRef & s = assert_cast(column).getDataAt(row_num); @@ -178,7 +181,7 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF } case TypeIndex::Enum8: { - auto schema = avro::EnumSchema("enum8"); + auto schema = avro::EnumSchema("enum8_" + toString(type_name_increment)); /// type names must be different for different types. std::unordered_map enum_mapping; const auto & enum_values = assert_cast(*data_type).getValues(); for (size_t i = 0; i < enum_values.size(); ++i) @@ -194,7 +197,7 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF } case TypeIndex::Enum16: { - auto schema = avro::EnumSchema("enum16"); + auto schema = avro::EnumSchema("enum16" + toString(type_name_increment)); std::unordered_map enum_mapping; const auto & enum_values = assert_cast(*data_type).getValues(); for (size_t i = 0; i < enum_values.size(); ++i) @@ -211,7 +214,7 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF case TypeIndex::Array: { const auto & array_type = assert_cast(*data_type); - auto nested_mapping = createSchemaWithSerializeFn(array_type.getNestedType()); + auto nested_mapping = createSchemaWithSerializeFn(array_type.getNestedType(), type_name_increment); auto schema = avro::ArraySchema(nested_mapping.schema); return {schema, [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { @@ -237,7 +240,7 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF case TypeIndex::Nullable: { auto nested_type = removeNullable(data_type); - auto nested_mapping = createSchemaWithSerializeFn(nested_type); + auto nested_mapping = createSchemaWithSerializeFn(nested_type, type_name_increment); if (nested_type->getTypeId() == 
TypeIndex::Nothing) { return nested_mapping; @@ -266,7 +269,7 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF case TypeIndex::LowCardinality: { const auto & nested_type = removeLowCardinality(data_type); - auto nested_mapping = createSchemaWithSerializeFn(nested_type); + auto nested_mapping = createSchemaWithSerializeFn(nested_type, type_name_increment); return {nested_mapping.schema, [nested_mapping](const IColumn & column, size_t row_num, avro::Encoder & encoder) { const auto & col = assert_cast(column); @@ -285,11 +288,13 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF AvroSerializer::AvroSerializer(const ColumnsWithTypeAndName & columns) { avro::RecordSchema record_schema("row"); + + size_t type_name_increment = 0; for (auto & column : columns) { try { - auto field_mapping = createSchemaWithSerializeFn(column.type); + auto field_mapping = createSchemaWithSerializeFn(column.type, type_name_increment); serialize_fns.push_back(field_mapping.serialize); //TODO: verify name starts with A-Za-z_ record_schema.addField(column.name, field_mapping.schema); @@ -312,7 +317,7 @@ void AvroSerializer::serializeRow(const Columns & columns, size_t row_num, avro: } } -static avro::Codec getCodec(const std::string& codec_name) +static avro::Codec getCodec(const std::string & codec_name) { if (codec_name == "") { diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.h b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.h index efe63c1a72f..4d404337d74 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.h @@ -32,7 +32,9 @@ private: avro::Schema schema; SerializeFn serialize; }; - static SchemaWithSerializeFn createSchemaWithSerializeFn(DataTypePtr data_type); + + /// Type names for different complex types (e.g. enums, fixed strings) must be unique. We use simple incremental number to give them different names. 
+ static SchemaWithSerializeFn createSchemaWithSerializeFn(DataTypePtr data_type, size_t & type_name_increment); std::vector serialize_fns; avro::ValidSchema schema; From 35726ac2dd304ef70b3d4dd8922a8dd758e95639 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 05:03:53 +0300 Subject: [PATCH 53/89] Fixed error in Avro format --- dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index c32f46552b1..26b427dfa31 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -172,7 +172,7 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF case TypeIndex::FixedString: { auto size = data_type->getSizeOfValueInMemory(); - auto schema = avro::FixedSchema(size, "fixed" + toString(size)); + auto schema = avro::FixedSchema(size, "fixed_" + toString(type_name_increment)); return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) { const StringRef & s = assert_cast(column).getDataAt(row_num); From fb89ffdbb3a1cbe6144858eb49f48cab6ad0898d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 05:12:11 +0300 Subject: [PATCH 54/89] Added comments --- dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h index 353f611a36e..0fb979b4f4e 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -30,8 +30,13 @@ private: static DeserializeFn createDeserializeFn(avro::NodePtr root_node, DataTypePtr target_type); static SkipFn createSkipFn(avro::NodePtr root_node); + /// Map from field index in Avro schema 
to column number in block header. Or -1 if there is no corresponding column. std::vector field_mapping; + + /// How to skip the corresponding field in Avro schema. std::vector skip_fns; + + /// How to deserialize the corresponding field in Avro schema. std::vector deserialize_fns; }; From 9a8211cbffa09e2ff37acec8d45608bc44d0c6e6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 05:39:58 +0300 Subject: [PATCH 55/89] Added Avro roundtrip test --- dbms/tests/queries/1_stateful/00154_avro.reference | 2 ++ dbms/tests/queries/1_stateful/00154_avro.sql | 9 +++++++++ 2 files changed, 11 insertions(+) create mode 100644 dbms/tests/queries/1_stateful/00154_avro.reference create mode 100644 dbms/tests/queries/1_stateful/00154_avro.sql diff --git a/dbms/tests/queries/1_stateful/00154_avro.reference b/dbms/tests/queries/1_stateful/00154_avro.reference new file mode 100644 index 00000000000..7e243047e8b --- /dev/null +++ b/dbms/tests/queries/1_stateful/00154_avro.reference @@ -0,0 +1,2 @@ +17300372046749301651 +17300372046749301651 diff --git a/dbms/tests/queries/1_stateful/00154_avro.sql b/dbms/tests/queries/1_stateful/00154_avro.sql new file mode 100644 index 00000000000..3d43a23e516 --- /dev/null +++ b/dbms/tests/queries/1_stateful/00154_avro.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS test.avro; + +CREATE TABLE test.avro AS test.hits ENGINE = File(Avro); +INSERT INTO test.avro SELECT * FROM test.hits WHERE intHash64(WatchID) % 100 = 0; + +SELECT sum(cityHash64(*)) FROM test.hits WHERE intHash64(WatchID) % 100 = 0; +SELECT sum(cityHash64(*)) FROM test.avro; + +DROP TABLE test.avro; From d853cffe619501fca55ac726cae370c46cf44463 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 05:44:22 +0300 Subject: [PATCH 56/89] Returned back DateTime64 --- .../Formats/Impl/AvroRowInputFormat.cpp | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp 
b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index b702e42c4f0..acbd892eb48 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -172,10 +172,27 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node insertNumber(column, target, decoder.decodeInt()); }; case avro::AVRO_LONG: - return [target](IColumn & column, avro::Decoder & decoder) + if (target.isDateTime64()) { - insertNumber(column, target, decoder.decodeLong()); - }; + auto date_time_scale = assert_cast(*target_type).getScale(); + auto logical_type = root_node->logicalType().type(); + if ((logical_type == avro::LogicalType::TIMESTAMP_MILLIS && date_time_scale == 3) + || (logical_type == avro::LogicalType::TIMESTAMP_MICROS && date_time_scale == 6)) + { + return [](IColumn & column, avro::Decoder & decoder) + { + assert_cast(column).insertValue(decoder.decodeLong()); + }; + } + } + else + { + return [target](IColumn & column, avro::Decoder & decoder) + { + insertNumber(column, target, decoder.decodeLong()); + }; + } + break; case avro::AVRO_FLOAT: return [target](IColumn & column, avro::Decoder & decoder) { @@ -411,16 +428,19 @@ AvroDeserializer::AvroDeserializer(const ColumnsWithTypeAndName & columns, avro: { throw Exception("Root schema must be a record", ErrorCodes::TYPE_MISMATCH); } + field_mapping.resize(schema_root->leaves(), -1); + for (size_t i = 0; i < schema_root->leaves(); ++i) { skip_fns.push_back(createSkipFn(schema_root->leafAt(i))); deserialize_fns.push_back(&deserializeNoop); } + for (size_t i = 0; i < columns.size(); ++i) { const auto & column = columns[i]; - size_t field_index; + size_t field_index = 0; if (!schema_root->nameIndex(column.name, field_index)) { throw Exception("Field " + column.name + " not found in Avro schema", ErrorCodes::THERE_IS_NO_COLUMN); From 64baafd24511aa97a0394092bb64510aec3327ea Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 
2020 05:51:49 +0300 Subject: [PATCH 57/89] Updated Avro test --- dbms/tests/queries/0_stateless/01060_avro.reference | 4 ++-- dbms/tests/queries/0_stateless/01060_avro.sh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dbms/tests/queries/0_stateless/01060_avro.reference b/dbms/tests/queries/0_stateless/01060_avro.reference index f8b3434177d..21fcc53f081 100644 --- a/dbms/tests/queries/0_stateless/01060_avro.reference +++ b/dbms/tests/queries/0_stateless/01060_avro.reference @@ -21,7 +21,7 @@ 1000 = other 0 -not compatible +1000 not found === output = primitive @@ -33,4 +33,4 @@ not found = other 0 1000 -not supported +147 diff --git a/dbms/tests/queries/0_stateless/01060_avro.sh b/dbms/tests/queries/0_stateless/01060_avro.sh index c92cba188d7..b57a7ad7a85 100755 --- a/dbms/tests/queries/0_stateless/01060_avro.sh +++ b/dbms/tests/queries/0_stateless/01060_avro.sh @@ -37,7 +37,7 @@ echo = other #no data cat $DATA_DIR/empty.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table' # type mismatch -cat $DATA_DIR/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int32' -q 'select count() from table' 2>&1 | grep -i 'not compatible' -o +cat $DATA_DIR/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int32' -q 'select count() from table' # field not found cat $DATA_DIR/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'b Int64' -q 'select count() from table' 2>&1 | grep -i 'not found' -o @@ -66,5 +66,5 @@ S4="a Int64" ${CLICKHOUSE_LOCAL} -q "select toInt64(number) as a from numbers(0) format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S4" -q 'select count() from table' ${CLICKHOUSE_LOCAL} -q "select toInt64(number) as a from numbers(1000) format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S4" -q 'select count() from table' -# type not 
supported -${CLICKHOUSE_LOCAL} -q "select toInt16(123) as a format Avro" 2>&1 | grep -i 'not supported' -o \ No newline at end of file +# type supported via conversion +${CLICKHOUSE_LOCAL} -q "select toInt16(123) as a format Avro" | wc -c \ No newline at end of file From b80e3dcd467a73f28a15f8c01890c502c6a87c32 Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Thu, 23 Jan 2020 11:18:19 +0300 Subject: [PATCH 58/89] Refactoring of replxx (#8748) --- CMakeLists.txt | 1 - cmake/find/replxx.cmake | 40 ---------- contrib/CMakeLists.txt | 4 +- contrib/replxx-cmake/CMakeLists.txt | 71 ++++++++++++++---- dbms/programs/client/CMakeLists.txt | 2 +- dbms/programs/client/Client.cpp | 13 +++- ...StorageSystemBuildOptions.generated.cpp.in | 1 - docker/packager/packager | 2 +- libs/libcommon/CMakeLists.txt | 18 ++++- libs/libcommon/include/common/LineReader.h | 24 +++--- .../include/common/ReplxxLineReader.h | 18 +++++ .../include/common/config_common.h.in | 1 - libs/libcommon/src/LineReader.cpp | 74 +++---------------- libs/libcommon/src/ReplxxLineReader.cpp | 57 ++++++++++++++ utils/zookeeper-cli/CMakeLists.txt | 2 +- utils/zookeeper-cli/zookeeper-cli.cpp | 15 ++-- 16 files changed, 191 insertions(+), 152 deletions(-) delete mode 100644 cmake/find/replxx.cmake create mode 100644 libs/libcommon/include/common/ReplxxLineReader.h create mode 100644 libs/libcommon/src/ReplxxLineReader.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d37cdfc3af8..7c8ccb6e17c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -352,7 +352,6 @@ include (cmake/find/simdjson.cmake) include (cmake/find/rapidjson.cmake) include (cmake/find/fastops.cmake) include (cmake/find/orc.cmake) -include (cmake/find/replxx.cmake) find_contrib_lib(cityhash) find_contrib_lib(farmhash) diff --git a/cmake/find/replxx.cmake b/cmake/find/replxx.cmake deleted file mode 100644 index 3a0e5917b04..00000000000 --- a/cmake/find/replxx.cmake +++ /dev/null @@ -1,40 +0,0 @@ -option 
(ENABLE_REPLXX "Enable replxx support" ${NOT_UNBUNDLED}) - -if (ENABLE_REPLXX) - option (USE_INTERNAL_REPLXX "Use internal replxx library" ${NOT_UNBUNDLED}) - - if (USE_INTERNAL_REPLXX AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/replxx/README.md") - message (WARNING "submodule contrib/replxx is missing. to fix try run: \n git submodule update --init --recursive") - set (USE_INTERNAL_REPLXX 0) - endif () - - if (NOT USE_INTERNAL_REPLXX) - find_library(LIBRARY_REPLXX NAMES replxx replxx-static) - find_path(INCLUDE_REPLXX replxx.hxx) - - add_library(replxx UNKNOWN IMPORTED) - set_property(TARGET replxx PROPERTY IMPORTED_LOCATION ${LIBRARY_REPLXX}) - target_include_directories(replxx PUBLIC ${INCLUDE_REPLXX}) - - set(CMAKE_REQUIRED_LIBRARIES replxx) - check_cxx_source_compiles( - " - #include - int main() { - replxx::Replxx rx; - } - " - EXTERNAL_REPLXX_WORKS - ) - - if (NOT EXTERNAL_REPLXX_WORKS) - message (FATAL_ERROR "replxx is unusable: ${LIBRARY_REPLXX} ${INCLUDE_REPLXX}") - endif () - endif () - - set(USE_REPLXX 1) - - message (STATUS "Using replxx") -else () - set(USE_REPLXX 0) -endif () diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index f81d616cddd..89f12ce0b70 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -332,6 +332,4 @@ if (USE_FASTOPS) add_subdirectory (fastops-cmake) endif() -if (USE_INTERNAL_REPLXX) - add_subdirectory (replxx-cmake) -endif() +add_subdirectory(replxx-cmake) diff --git a/contrib/replxx-cmake/CMakeLists.txt b/contrib/replxx-cmake/CMakeLists.txt index 1b27fd53070..1240eb56b39 100644 --- a/contrib/replxx-cmake/CMakeLists.txt +++ b/contrib/replxx-cmake/CMakeLists.txt @@ -1,18 +1,57 @@ -set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/replxx") +option (ENABLE_REPLXX "Enable replxx support" ${ENABLE_LIBRARIES}) -set(SRCS - ${LIBRARY_DIR}/src/conversion.cxx - ${LIBRARY_DIR}/src/escape.cxx - ${LIBRARY_DIR}/src/history.cxx - ${LIBRARY_DIR}/src/io.cxx - ${LIBRARY_DIR}/src/prompt.cxx - 
${LIBRARY_DIR}/src/replxx.cxx - ${LIBRARY_DIR}/src/replxx_impl.cxx - ${LIBRARY_DIR}/src/util.cxx - ${LIBRARY_DIR}/src/wcwidth.cpp - ${LIBRARY_DIR}/src/ConvertUTF.cpp -) +if (ENABLE_REPLXX) + option (USE_INTERNAL_REPLXX "Use internal replxx library" ${NOT_UNBUNDLED}) -add_library(replxx ${SRCS}) -target_include_directories(replxx PUBLIC ${LIBRARY_DIR}/include) -target_compile_options(replxx PUBLIC -Wno-documentation) + if (USE_INTERNAL_REPLXX) + set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/replxx") + + set(SRCS + ${LIBRARY_DIR}/src/conversion.cxx + ${LIBRARY_DIR}/src/ConvertUTF.cpp + ${LIBRARY_DIR}/src/escape.cxx + ${LIBRARY_DIR}/src/history.cxx + ${LIBRARY_DIR}/src/io.cxx + ${LIBRARY_DIR}/src/prompt.cxx + ${LIBRARY_DIR}/src/replxx_impl.cxx + ${LIBRARY_DIR}/src/replxx.cxx + ${LIBRARY_DIR}/src/util.cxx + ${LIBRARY_DIR}/src/wcwidth.cpp + ) + + add_library (replxx ${SRCS}) + target_include_directories(replxx PUBLIC ${LIBRARY_DIR}/include) + else () + find_library(LIBRARY_REPLXX NAMES replxx replxx-static) + find_path(INCLUDE_REPLXX replxx.hxx) + + add_library(replxx UNKNOWN IMPORTED) + set_property(TARGET replxx PROPERTY IMPORTED_LOCATION ${LIBRARY_REPLXX}) + target_include_directories(replxx PUBLIC ${INCLUDE_REPLXX}) + + set(CMAKE_REQUIRED_LIBRARIES replxx) + check_cxx_source_compiles( + " + #include + int main() { + replxx::Replxx rx; + } + " + EXTERNAL_REPLXX_WORKS + ) + + if (NOT EXTERNAL_REPLXX_WORKS) + message (FATAL_ERROR "replxx is unusable: ${LIBRARY_REPLXX} ${INCLUDE_REPLXX}") + endif () + endif () + + target_compile_options(replxx PUBLIC -Wno-documentation) + target_compile_definitions(replxx PUBLIC USE_REPLXX=1) + + message (STATUS "Using replxx") +else () + add_library(replxx INTERFACE) + target_compile_definitions(replxx INTERFACE USE_REPLXX=0) + + message (STATUS "Not using replxx (Beware! 
Runtime fallback to readline is possible!)") +endif () diff --git a/dbms/programs/client/CMakeLists.txt b/dbms/programs/client/CMakeLists.txt index d4c157ac3b0..11ade559a8d 100644 --- a/dbms/programs/client/CMakeLists.txt +++ b/dbms/programs/client/CMakeLists.txt @@ -4,7 +4,7 @@ set(CLICKHOUSE_CLIENT_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Suggest.cpp ) -set(CLICKHOUSE_CLIENT_LINK PRIVATE clickhouse_common_config clickhouse_functions clickhouse_aggregate_functions clickhouse_common_io clickhouse_parsers string_utils ${LINE_EDITING_LIBS} ${Boost_PROGRAM_OPTIONS_LIBRARY}) +set(CLICKHOUSE_CLIENT_LINK PRIVATE clickhouse_common_config clickhouse_functions clickhouse_aggregate_functions clickhouse_common_io clickhouse_parsers string_utils ${Boost_PROGRAM_OPTIONS_LIBRARY}) include(CheckSymbolExists) check_symbol_exists(readpassphrase readpassphrase.h HAVE_READPASSPHRASE) diff --git a/dbms/programs/client/Client.cpp b/dbms/programs/client/Client.cpp index 76a225e2597..426a6ad884d 100644 --- a/dbms/programs/client/Client.cpp +++ b/dbms/programs/client/Client.cpp @@ -2,6 +2,12 @@ #include "ConnectionParameters.h" #include "Suggest.h" +#if USE_REPLXX +# include +#else +# include +#endif + #include #include #include @@ -19,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -496,7 +501,11 @@ private: if (!history_file.empty() && !Poco::File(history_file).exists()) Poco::File(history_file).createFile(); - LineReader lr(&Suggest::instance(), history_file, '\\', config().has("multiline") ? ';' : 0); +#if USE_REPLXX + ReplxxLineReader lr(Suggest::instance(), history_file, '\\', config().has("multiline") ? ';' : 0); +#else + LineReader lr(history_file, '\\', config().has("multiline") ? 
';' : 0); +#endif do { diff --git a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index 550ead28996..65c4f19b7cb 100644 --- a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -61,7 +61,6 @@ const char * auto_config_build[] "USE_HYPERSCAN", "@USE_HYPERSCAN@", "USE_SIMDJSON", "@USE_SIMDJSON@", "USE_POCO_REDIS", "@USE_POCO_REDIS@", - "USE_REPLXX", "@USE_REPLXX@", nullptr, nullptr }; diff --git a/docker/packager/packager b/docker/packager/packager index 62767cae8f0..a31a387d502 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -177,7 +177,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ if unbundled: # TODO: fix build with ENABLE_RDKAFKA - cmake_flags.append('-DUNBUNDLED=1 -DENABLE_MYSQL=0 -DENABLE_POCO_ODBC=0 -DENABLE_ODBC=0 -DENABLE_READLINE=0 -DENABLE_RDKAFKA=0') + cmake_flags.append('-DUNBUNDLED=1 -DENABLE_MYSQL=0 -DENABLE_POCO_ODBC=0 -DENABLE_ODBC=0 -DENABLE_REPLXX=0 -DENABLE_RDKAFKA=0') if split_binary: cmake_flags.append('-DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1 -DCLICKHOUSE_SPLIT_BINARY=1') diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 312fcc48b13..b83c876978e 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -10,7 +10,7 @@ if (DEFINED APPLE_HAVE_CLOCK_GETTIME) target_compile_definitions(apple_rt PUBLIC -DAPPLE_HAVE_CLOCK_GETTIME=${APPLE_HAVE_CLOCK_GETTIME}) endif () -add_library (common +set (COMMON_SRCS src/argsToConfig.cpp src/coverage.cpp src/DateLUT.cpp @@ -65,7 +65,19 @@ add_library (common include/ext/scope_guard.h include/ext/size.h include/ext/unlock_guard.h +) +if (ENABLE_REPLXX) + set (COMMON_SRCS + src/ReplxxLineReader.cpp + include/common/ReplxxLineReader.h + + ${COMMON_SRCS} + ) +endif () + +add_library (common + ${COMMON_SRCS} 
${CONFIG_COMMON}) if (USE_INTERNAL_MEMCPY) @@ -92,8 +104,8 @@ if(CCTZ_LIBRARY) target_link_libraries(common PRIVATE ${CCTZ_LIBRARY}) endif() -if (USE_REPLXX) - target_link_libraries(common PRIVATE replxx) +if (ENABLE_REPLXX) + target_link_libraries(common PUBLIC replxx) endif () target_link_libraries (common diff --git a/libs/libcommon/include/common/LineReader.h b/libs/libcommon/include/common/LineReader.h index 120ff76dac6..aa2954db4fc 100644 --- a/libs/libcommon/include/common/LineReader.h +++ b/libs/libcommon/include/common/LineReader.h @@ -22,8 +22,8 @@ public: WordsRange getCompletions(const String & prefix, size_t prefix_length) const; }; - LineReader(const Suggest * suggest, const String & history_file_path, char extender, char delimiter = 0); /// if delimiter != 0, then it's multiline mode - ~LineReader(); + LineReader(const String & history_file_path, char extender, char delimiter = 0); /// if delimiter != 0, then it's multiline mode + virtual ~LineReader() {} /// Reads the whole line until delimiter (in multiline mode) or until the last line without extender. /// If resulting line is empty, it means the user interrupted the input. @@ -31,7 +31,7 @@ public: /// Typical delimiter is ';' (semicolon) and typical extender is '\' (backslash). String readLine(const String & first_prompt, const String & second_prompt); -private: +protected: enum InputStatus { ABORT = 0, @@ -39,19 +39,17 @@ private: INPUT_LINE, }; - String input; - String prev_line; const String history_file_path; + static constexpr char word_break_characters[] = " \t\n\r\"\\'`@$><=;|&{(."; + + String input; + +private: const char extender; const char delimiter; - InputStatus readOneLine(const String & prompt); - void addToHistory(const String & line); + String prev_line; - /// Since CMake doesn't impose restrictions on includes between unrelated targets - /// it's possible that we include this file without USE_REPLXX defined. 
-#ifdef __clang__ - [[maybe_unused]] -#endif - void * impl; + virtual InputStatus readOneLine(const String & prompt); + virtual void addToHistory(const String &) {} }; diff --git a/libs/libcommon/include/common/ReplxxLineReader.h b/libs/libcommon/include/common/ReplxxLineReader.h new file mode 100644 index 00000000000..47eabbf9330 --- /dev/null +++ b/libs/libcommon/include/common/ReplxxLineReader.h @@ -0,0 +1,18 @@ +#pragma once + +#include "LineReader.h" + +#include + +class ReplxxLineReader : public LineReader +{ +public: + ReplxxLineReader(const Suggest & suggest, const String & history_file_path, char extender, char delimiter = 0); + ~ReplxxLineReader() override; + +private: + InputStatus readOneLine(const String & prompt) override; + void addToHistory(const String & line) override; + + replxx::Replxx rx; +}; diff --git a/libs/libcommon/include/common/config_common.h.in b/libs/libcommon/include/common/config_common.h.in index 6cee84a5b32..41999bb5cde 100644 --- a/libs/libcommon/include/common/config_common.h.in +++ b/libs/libcommon/include/common/config_common.h.in @@ -3,6 +3,5 @@ // .h autogenerated by cmake ! #cmakedefine01 USE_JEMALLOC -#cmakedefine01 USE_REPLXX #cmakedefine01 UNBUNDLED #cmakedefine01 WITH_COVERAGE diff --git a/libs/libcommon/src/LineReader.cpp b/libs/libcommon/src/LineReader.cpp index 6ac1e856347..4a3a737fe7c 100644 --- a/libs/libcommon/src/LineReader.cpp +++ b/libs/libcommon/src/LineReader.cpp @@ -1,26 +1,20 @@ -#include #include -#if USE_REPLXX -#include -#else - -/// We can detect if code is linked with one or another readline variants or open the library dynamically. -#include -extern "C" -{ - char * readline(const char *) __attribute__((__weak__)); - char * (*readline_ptr)(const char *) = readline; -} - -#endif - #include #include #include #include +#ifdef OS_LINUX +/// We can detect if code is linked with one or another readline variants or open the library dynamically. 
+# include +extern "C" +{ + char * readline(const char *) __attribute__((__weak__)); + char * (*readline_ptr)(const char *) = readline; +} +#endif namespace { @@ -42,8 +36,6 @@ bool hasInputData() return select(1, &fds, nullptr, nullptr, &timeout) == 1; } -constexpr char word_break_characters[] = " \t\n\r\"\\'`@$><=;|&{(."; - } LineReader::Suggest::WordsRange LineReader::Suggest::getCompletions(const String & prefix, size_t prefix_length) const @@ -68,39 +60,12 @@ LineReader::Suggest::WordsRange LineReader::Suggest::getCompletions(const String }); } -LineReader::LineReader(const Suggest * suggest, const String & history_file_path_, char extender_, char delimiter_) +LineReader::LineReader(const String & history_file_path_, char extender_, char delimiter_) : history_file_path(history_file_path_), extender(extender_), delimiter(delimiter_) { -#if USE_REPLXX - impl = new replxx::Replxx; - auto & rx = *(replxx::Replxx*)(impl); - - if (!history_file_path.empty()) - rx.history_load(history_file_path); - - auto callback = [suggest] (const String & context, size_t context_size) - { - auto range = suggest->getCompletions(context, context_size); - return replxx::Replxx::completions_t(range.first, range.second); - }; - - rx.set_completion_callback(callback); - rx.set_complete_on_empty(false); - rx.set_word_break_characters(word_break_characters); -#endif /// FIXME: check extender != delimiter } -LineReader::~LineReader() -{ -#if USE_REPLXX - auto & rx = *(replxx::Replxx*)(impl); - if (!history_file_path.empty()) - rx.history_save(history_file_path); - delete (replxx::Replxx *)impl; -#endif -} - String LineReader::readLine(const String & first_prompt, const String & second_prompt) { String line; @@ -149,14 +114,7 @@ LineReader::InputStatus LineReader::readOneLine(const String & prompt) { input.clear(); -#if USE_REPLXX - auto & rx = *(replxx::Replxx*)(impl); - const char* cinput = rx.input(prompt); - if (cinput == nullptr) - return (errno != EAGAIN) ? 
ABORT : RESET_LINE; - input = cinput; -#else - +#ifdef OS_LINUX if (!readline_ptr) { for (auto name : {"libreadline.so", "libreadline.so.0", "libeditline.so", "libeditline.so.0"}) @@ -182,22 +140,14 @@ LineReader::InputStatus LineReader::readOneLine(const String & prompt) input = line_read; } else +#endif { std::cout << prompt; std::getline(std::cin, input); if (!std::cin.good()) return ABORT; } -#endif trim(input); return INPUT_LINE; } - -void LineReader::addToHistory(const String & line) -{ -#if USE_REPLXX - auto & rx = *(replxx::Replxx*)(impl); - rx.history_add(line); -#endif -} diff --git a/libs/libcommon/src/ReplxxLineReader.cpp b/libs/libcommon/src/ReplxxLineReader.cpp new file mode 100644 index 00000000000..044ea05413d --- /dev/null +++ b/libs/libcommon/src/ReplxxLineReader.cpp @@ -0,0 +1,57 @@ +#include + +#include +#include +#include + +namespace +{ + +/// Trim ending whitespace inplace +void trim(String & s) +{ + s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { return !std::isspace(ch); }).base(), s.end()); +} + +} + +ReplxxLineReader::ReplxxLineReader(const Suggest & suggest, const String & history_file_path_, char extender_, char delimiter_) + : LineReader(history_file_path_, extender_, delimiter_) +{ + if (!history_file_path.empty()) + rx.history_load(history_file_path); + + auto callback = [&suggest] (const String & context, size_t context_size) + { + auto range = suggest.getCompletions(context, context_size); + return replxx::Replxx::completions_t(range.first, range.second); + }; + + rx.set_completion_callback(callback); + rx.set_complete_on_empty(false); + rx.set_word_break_characters(word_break_characters); +} + +ReplxxLineReader::~ReplxxLineReader() +{ + if (!history_file_path.empty()) + rx.history_save(history_file_path); +} + +LineReader::InputStatus ReplxxLineReader::readOneLine(const String & prompt) +{ + input.clear(); + + const char* cinput = rx.input(prompt); + if (cinput == nullptr) + return (errno != EAGAIN) ? 
ABORT : RESET_LINE; + input = cinput; + + trim(input); + return INPUT_LINE; +} + +void ReplxxLineReader::addToHistory(const String & line) +{ + rx.history_add(line); +} diff --git a/utils/zookeeper-cli/CMakeLists.txt b/utils/zookeeper-cli/CMakeLists.txt index 7c14ed605fb..7e67f078586 100644 --- a/utils/zookeeper-cli/CMakeLists.txt +++ b/utils/zookeeper-cli/CMakeLists.txt @@ -1,3 +1,3 @@ add_executable(clickhouse-zookeeper-cli zookeeper-cli.cpp) -target_link_libraries(clickhouse-zookeeper-cli PRIVATE clickhouse_common_zookeeper ${Poco_Foundation_LIBRARY} ${LINE_EDITING_LIBS}) +target_link_libraries(clickhouse-zookeeper-cli PRIVATE clickhouse_common_zookeeper ${Poco_Foundation_LIBRARY}) INSTALL(TARGETS clickhouse-zookeeper-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse-utils) diff --git a/utils/zookeeper-cli/zookeeper-cli.cpp b/utils/zookeeper-cli/zookeeper-cli.cpp index 5e36ffecdaa..44140423a15 100644 --- a/utils/zookeeper-cli/zookeeper-cli.cpp +++ b/utils/zookeeper-cli/zookeeper-cli.cpp @@ -1,12 +1,13 @@ -#include +#include +#include +#include #include +#include +#include +#include + #include #include -#include -#include -#include -#include -#include void printStat(const Coordination::Stat & s) @@ -69,7 +70,7 @@ int main(int argc, char ** argv) Logger::root().setLevel("trace"); zkutil::ZooKeeper zk(argv[1]); - LineReader lr(nullptr, {}, '\\'); + LineReader lr({}, '\\'); do { From e6958c0e498e595099836651f083e9a639a40775 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 23 Jan 2020 12:01:55 +0300 Subject: [PATCH 59/89] Disable some flappy tests --- dbms/tests/integration/test_multiple_disks/test.py | 6 ++++++ dbms/tests/integration/test_ttl_move/test.py | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/dbms/tests/integration/test_multiple_disks/test.py b/dbms/tests/integration/test_multiple_disks/test.py index d2d14e88b9a..9ccac05b9f4 100644 --- a/dbms/tests/integration/test_multiple_disks/test.py +++ 
b/dbms/tests/integration/test_multiple_disks/test.py @@ -360,6 +360,7 @@ def test_max_data_part_size(start_cluster, name, engine): finally: node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("mt_with_overflow","MergeTree()"), ("replicated_mt_with_overflow","ReplicatedMergeTree('/clickhouse/replicated_mt_with_overflow', '1')",), @@ -454,6 +455,7 @@ def test_background_move(start_cluster, name, engine): finally: node1.query("DROP TABLE IF EXISTS {name}".format(name=name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("stopped_moving_mt","MergeTree()"), ("stopped_moving_replicated_mt","ReplicatedMergeTree('/clickhouse/stopped_moving_replicated_mt', '1')",), @@ -720,6 +722,7 @@ def produce_alter_move(node, name): pass +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("concurrently_altering_mt","MergeTree()"), ("concurrently_altering_replicated_mt","ReplicatedMergeTree('/clickhouse/concurrently_altering_replicated_mt', '1')",), @@ -773,6 +776,7 @@ def test_concurrent_alter_move(start_cluster, name, engine): finally: node1.query("DROP TABLE IF EXISTS {name}".format(name=name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("concurrently_dropping_mt","MergeTree()"), ("concurrently_dropping_replicated_mt","ReplicatedMergeTree('/clickhouse/concurrently_dropping_replicated_mt', '1')",), @@ -901,6 +905,8 @@ def test_mutate_to_another_disk(start_cluster, name, engine): finally: node1.query("DROP TABLE IF EXISTS {name}".format(name=name)) + +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("alter_modifying_mt","MergeTree()"), ("replicated_alter_modifying_mt","ReplicatedMergeTree('/clickhouse/replicated_alter_modifying_mt', '1')",), diff --git a/dbms/tests/integration/test_ttl_move/test.py b/dbms/tests/integration/test_ttl_move/test.py index 
7fabdd85230..b498178e4d7 100644 --- a/dbms/tests/integration/test_ttl_move/test.py +++ b/dbms/tests/integration/test_ttl_move/test.py @@ -50,6 +50,7 @@ def get_used_disks_for_table(node, table_name): return node.query("select disk_name from system.parts where table == '{}' and active=1 order by modification_time".format(table_name)).strip().split('\n') +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine,alter", [ ("mt_test_rule_with_invalid_destination","MergeTree()",0), ("replicated_mt_test_rule_with_invalid_destination","ReplicatedMergeTree('/clickhouse/replicated_test_rule_with_invalid_destination', '1')",0), @@ -109,6 +110,7 @@ def test_rule_with_invalid_destination(started_cluster, name, engine, alter): node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine,positive", [ ("mt_test_inserts_to_disk_do_not_work","MergeTree()",0), ("replicated_mt_test_inserts_to_disk_do_not_work","ReplicatedMergeTree('/clickhouse/replicated_test_inserts_to_disk_do_not_work', '1')",0), @@ -141,6 +143,7 @@ def test_inserts_to_disk_work(started_cluster, name, engine, positive): node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine,positive", [ ("mt_test_moves_to_disk_do_not_work","MergeTree()",0), ("replicated_mt_test_moves_to_disk_do_not_work","ReplicatedMergeTree('/clickhouse/replicated_test_moves_to_disk_do_not_work', '1')",0), @@ -187,6 +190,7 @@ def test_moves_to_disk_work(started_cluster, name, engine, positive): node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("mt_test_moves_to_volume_work","MergeTree()"), ("replicated_mt_test_moves_to_volume_work","ReplicatedMergeTree('/clickhouse/replicated_test_moves_to_volume_work', '1')"), @@ -233,6 +237,7 @@ def test_moves_to_volume_work(started_cluster, name, engine): 
node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine,positive", [ ("mt_test_inserts_to_volume_do_not_work","MergeTree()",0), ("replicated_mt_test_inserts_to_volume_do_not_work","ReplicatedMergeTree('/clickhouse/replicated_test_inserts_to_volume_do_not_work', '1')",0), @@ -271,6 +276,7 @@ def test_inserts_to_volume_work(started_cluster, name, engine, positive): node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("mt_test_moves_to_disk_eventually_work","MergeTree()"), ("replicated_mt_test_moves_to_disk_eventually_work","ReplicatedMergeTree('/clickhouse/replicated_test_moves_to_disk_eventually_work', '1')"), @@ -326,6 +332,7 @@ def test_moves_to_disk_eventually_work(started_cluster, name, engine): node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine,positive", [ ("mt_test_merges_to_disk_do_not_work","MergeTree()",0), ("replicated_mt_test_merges_to_disk_do_not_work","ReplicatedMergeTree('/clickhouse/replicated_test_merges_to_disk_do_not_work', '1')",0), @@ -383,6 +390,7 @@ def test_merges_to_disk_work(started_cluster, name, engine, positive): node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("mt_test_merges_with_full_disk_work","MergeTree()"), ("replicated_mt_test_merges_with_full_disk_work","ReplicatedMergeTree('/clickhouse/replicated_test_merges_with_full_disk_work', '1')"), @@ -449,6 +457,7 @@ def test_merges_with_full_disk_work(started_cluster, name, engine): node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine,positive", [ ("mt_test_moves_after_merges_do_not_work","MergeTree()",0), 
("replicated_mt_test_moves_after_merges_do_not_work","ReplicatedMergeTree('/clickhouse/replicated_test_moves_after_merges_do_not_work', '1')",0), @@ -501,6 +510,7 @@ def test_moves_after_merges_work(started_cluster, name, engine, positive): node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine,positive,bar", [ ("mt_test_moves_after_alter_do_not_work","MergeTree()",0,"DELETE"), ("replicated_mt_test_moves_after_alter_do_not_work","ReplicatedMergeTree('/clickhouse/replicated_test_moves_after_alter_do_not_work', '1')",0,"DELETE"), @@ -544,6 +554,7 @@ def test_ttls_do_not_work_after_alter(started_cluster, name, engine, positive, b node1.query("DROP TABLE IF EXISTS {}".format(name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine,positive", [ ("mt_test_alter_multiple_ttls_positive", "MergeTree()", True), ("mt_replicated_test_alter_multiple_ttls_positive", "ReplicatedMergeTree('/clickhouse/replicated_test_alter_multiple_ttls_positive', '1')", True), @@ -626,6 +637,7 @@ limitations under the License.""" node1.query("DROP TABLE IF EXISTS {name}".format(name=name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,engine", [ ("concurrently_altering_ttl_mt","MergeTree()"), ("concurrently_altering_ttl_replicated_mt","ReplicatedMergeTree('/clickhouse/concurrently_altering_ttl_replicated_mt', '1')",), @@ -716,6 +728,7 @@ def test_concurrent_alter_with_ttl_move(started_cluster, name, engine): finally: node1.query("DROP TABLE IF EXISTS {name}".format(name=name)) +@pytest.mark.skip(reason="Flappy test") @pytest.mark.parametrize("name,positive", [ ("test_double_move_while_select_negative", 0), ("test_double_move_while_select_positive", 1), From 814c4316744aae2b20c9ccef38983f8b2dfd2522 Mon Sep 17 00:00:00 2001 From: Gregory Date: Thu, 23 Jan 2020 12:22:00 +0300 Subject: [PATCH 60/89] Add extern "C" for cgo in arcadia to link [STRM-1049] See 
https://a.yandex-team.ru/review/1018931/files/ --- contrib/murmurhash/include/murmurhash3.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/contrib/murmurhash/include/murmurhash3.h b/contrib/murmurhash/include/murmurhash3.h index 256da1ad9da..eb16425576a 100644 --- a/contrib/murmurhash/include/murmurhash3.h +++ b/contrib/murmurhash/include/murmurhash3.h @@ -23,6 +23,10 @@ typedef unsigned __int64 uint64_t; #endif // !defined(_MSC_VER) +#ifdef __cplusplus +extern "C" { +#endif + //----------------------------------------------------------------------------- void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); @@ -32,3 +36,7 @@ void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); //----------------------------------------------------------------------------- + +#ifdef __cplusplus +} +#endif From f431b10e38116ab9938d4d00cbed040d7b63af43 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 Jan 2020 13:04:18 +0300 Subject: [PATCH 61/89] Update TCPHandler. --- dbms/programs/server/TCPHandler.cpp | 108 ++++++++---------- .../src/Processors/Formats/LazyOutputFormat.h | 8 +- 2 files changed, 52 insertions(+), 64 deletions(-) diff --git a/dbms/programs/server/TCPHandler.cpp b/dbms/programs/server/TCPHandler.cpp index 29bba1cca5e..1975349fcf1 100644 --- a/dbms/programs/server/TCPHandler.cpp +++ b/dbms/programs/server/TCPHandler.cpp @@ -591,11 +591,9 @@ void TCPHandler::processOrdinaryQueryWithProcessors(size_t num_threads) } }); - /// Wait in case of exception. Delete pipeline to release memory. + /// Wait in case of exception happened outside of pool. SCOPE_EXIT( - /// Clear queue in case if somebody is waiting lazy_format to push. lazy_format->finish(); - lazy_format->clearQueue(); try { @@ -604,72 +602,58 @@ void TCPHandler::processOrdinaryQueryWithProcessors(size_t num_threads) catch (...) 
{ /// If exception was thrown during pipeline execution, skip it while processing other exception. + tryLogCurrentException(log); } - - /// pipeline = QueryPipeline() ); - while (true) + while (!lazy_format->isFinished() && !exception) { - Block block; - - while (true) + if (isQueryCancelled()) { - if (isQueryCancelled()) - { - /// A packet was received requesting to stop execution of the request. - executor->cancel(); - - break; - } - else - { - if (after_send_progress.elapsed() / 1000 >= query_context->getSettingsRef().interactive_delay) - { - /// Some time passed and there is a progress. - after_send_progress.restart(); - sendProgress(); - } - - sendLogs(); - - if ((block = lazy_format->getBlock(query_context->getSettingsRef().interactive_delay / 1000))) - break; - - if (lazy_format->isFinished()) - break; - - if (exception) - { - pool.wait(); - break; - } - } - } - - /** If data has run out, we will send the profiling data and total values to - * the last zero block to be able to use - * this information in the suffix output of stream. - * If the request was interrupted, then `sendTotals` and other methods could not be called, - * because we have not read all the data yet, - * and there could be ongoing calculations in other threads at the same time. - */ - if (!block && !isQueryCancelled()) - { - pool.wait(); - pipeline.finalize(); - - sendTotals(lazy_format->getTotals()); - sendExtremes(lazy_format->getExtremes()); - sendProfileInfo(lazy_format->getProfileInfo()); - sendProgress(); - sendLogs(); - } - - sendData(block); - if (!block) + /// A packet was received requesting to stop execution of the request. + executor->cancel(); break; + } + + if (after_send_progress.elapsed() / 1000 >= query_context->getSettingsRef().interactive_delay) + { + /// Some time passed and there is a progress. 
+ after_send_progress.restart(); + sendProgress(); + } + + sendLogs(); + + if (auto block = lazy_format->getBlock(query_context->getSettingsRef().interactive_delay / 1000)) + { + if (!state.io.null_format) + sendData(block); + } } + + /// Finish lazy_format before waiting. Otherwise some thread may write into it, and waiting will lock. + lazy_format->finish(); + pool.wait(); + + /** If data has run out, we will send the profiling data and total values to + * the last zero block to be able to use + * this information in the suffix output of stream. + * If the request was interrupted, then `sendTotals` and other methods could not be called, + * because we have not read all the data yet, + * and there could be ongoing calculations in other threads at the same time. + */ + if (!isQueryCancelled()) + { + pipeline.finalize(); + + sendTotals(lazy_format->getTotals()); + sendExtremes(lazy_format->getExtremes()); + sendProfileInfo(lazy_format->getProfileInfo()); + sendProgress(); + sendLogs(); + } + + sendData({}); } state.io.onFinish(); diff --git a/dbms/src/Processors/Formats/LazyOutputFormat.h b/dbms/src/Processors/Formats/LazyOutputFormat.h index 56aaf249480..a3bc76e839f 100644 --- a/dbms/src/Processors/Formats/LazyOutputFormat.h +++ b/dbms/src/Processors/Formats/LazyOutputFormat.h @@ -26,8 +26,12 @@ public: void setRowsBeforeLimit(size_t rows_before_limit) override; - void finish() { finished_processing = true; } - void clearQueue() { queue.clear(); } + void finish() + { + finished_processing = true; + /// Clear queue in case if somebody is waiting lazy_format to push. + queue.clear(); + } protected: void consume(Chunk chunk) override From a362664cc15c5ac47ad943b004ce1f5ce47504db Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 23 Jan 2020 13:08:17 +0300 Subject: [PATCH 62/89] Better handling of `per_part_columns_lock`. 
--- dbms/src/Storages/MergeTree/MergeTreeReadPool.cpp | 2 +- dbms/src/Storages/MergeTree/MergeTreeReadPool.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreeReadPool.cpp b/dbms/src/Storages/MergeTree/MergeTreeReadPool.cpp index d308667a67b..a70dfc2d78c 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -217,7 +217,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo( per_part_sum_marks.push_back(sum_marks); - per_part_columns_lock.emplace_back(part.data_part->columns_lock); + per_part_columns_lock.emplace_back(part.data_part, part.data_part->columns_lock); auto [required_columns, required_pre_columns, should_reorder] = getReadTaskColumns(data, part.data_part, column_names, prewhere_info, check_columns); diff --git a/dbms/src/Storages/MergeTree/MergeTreeReadPool.h b/dbms/src/Storages/MergeTree/MergeTreeReadPool.h index 2e9cb76f0cd..4151b781d6e 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/dbms/src/Storages/MergeTree/MergeTreeReadPool.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -93,7 +94,7 @@ private: const size_t threads, const size_t sum_marks, std::vector per_part_sum_marks, RangesInDataParts & parts, const size_t min_marks_for_concurrent_read); - std::vector> per_part_columns_lock; + std::vector>> per_part_columns_lock; const MergeTreeData & data; Names column_names; bool do_not_steal_tasks; From bfdab88efbe4b4ff1da14dbabaea8818b8a8bdc9 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Thu, 23 Jan 2020 14:18:47 +0300 Subject: [PATCH 63/89] Restore \G suffix support --- dbms/programs/client/Client.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dbms/programs/client/Client.cpp b/dbms/programs/client/Client.cpp index 426a6ad884d..6c5f9e5fb30 100644 --- a/dbms/programs/client/Client.cpp +++ b/dbms/programs/client/Client.cpp @@ -513,6 +513,12 @@ private: if (input.empty()) 
break; + if (input.ends_with("\\G")) + { + input.resize(input.size() - 2); + has_vertical_output_suffix = true; + } + try { if (!process(input)) From c441c2fc9cb9c7f305961d5ec1a95a7b643882f2 Mon Sep 17 00:00:00 2001 From: millb Date: Thu, 23 Jan 2020 15:47:13 +0300 Subject: [PATCH 64/89] tests fixed --- .../01070_exception_code_in_query_log_table.reference | 4 ++-- .../0_stateless/01070_exception_code_in_query_log_table.sql | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference index ffe29f4e599..1dee767cd4e 100644 --- a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference +++ b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.reference @@ -1,3 +1,3 @@ +0 +0 60 -0 -0 diff --git a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql index f90bf6107d1..9cca089ce08 100644 --- a/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql +++ b/dbms/tests/queries/0_stateless/01070_exception_code_in_query_log_table.sql @@ -3,5 +3,5 @@ SELECT * FROM test_table_for_01070_exception_code_in_query_log_table; -- { serve CREATE TABLE test_table_for_01070_exception_code_in_query_log_table (value UInt64) ENGINE=Memory(); SELECT * FROM test_table_for_01070_exception_code_in_query_log_table; SYSTEM FLUSH LOGS; -SELECT exception_code FROM system.query_log WHERE query='SELECT * FROM test_table_for_01070_exception_code_in_query_log_table'; +SELECT exception_code FROM system.query_log WHERE query='SELECT * FROM test_table_for_01070_exception_code_in_query_log_table' ORDER BY exception_code; DROP TABLE IF EXISTS test_table_for_01070_exception_code_in_query_log_table; From f85e7ca36d43856573950ccde61bc5627e21d5cf Mon Sep 17 00:00:00 2001 From: Nikolai 
Kochetov Date: Thu, 23 Jan 2020 15:53:32 +0300 Subject: [PATCH 65/89] Enable block skip for null format in processors pipeline. --- dbms/src/Interpreters/executeQuery.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp index a71450580b7..dda81ae0d47 100644 --- a/dbms/src/Interpreters/executeQuery.cpp +++ b/dbms/src/Interpreters/executeQuery.cpp @@ -575,14 +575,15 @@ BlockIO executeQuery( BlockIO streams; std::tie(ast, streams) = executeQueryImpl(query.data(), query.data() + query.size(), context, internal, stage, !may_have_embedded_data, nullptr, allow_processors); - if (streams.in) + + if (const auto * ast_query_with_output = dynamic_cast(ast.get())) { - const auto * ast_query_with_output = dynamic_cast(ast.get()); String format_name = ast_query_with_output && (ast_query_with_output->format != nullptr) ? getIdentifierName(ast_query_with_output->format) : context.getDefaultFormat(); if (format_name == "Null") streams.null_format = true; } + return streams; } From b3e33b83304e9b359f414e608ec8ab3f148fe949 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 Jan 2020 15:59:01 +0300 Subject: [PATCH 66/89] Enable block skip for null format in processors pipeline. --- dbms/src/Interpreters/executeQuery.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp index dda81ae0d47..7bc8a7b7584 100644 --- a/dbms/src/Interpreters/executeQuery.cpp +++ b/dbms/src/Interpreters/executeQuery.cpp @@ -578,8 +578,10 @@ BlockIO executeQuery( if (const auto * ast_query_with_output = dynamic_cast(ast.get())) { - String format_name = ast_query_with_output && (ast_query_with_output->format != nullptr) - ? getIdentifierName(ast_query_with_output->format) : context.getDefaultFormat(); + String format_name = ast_query_with_output->format + ? 
getIdentifierName(ast_query_with_output->format) + : context.getDefaultFormat(); + if (format_name == "Null") streams.null_format = true; } From b5d8baee7107f2e04632579ed51039b2c192b24d Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Wed, 22 Jan 2020 22:52:55 +0300 Subject: [PATCH 67/89] fix alters if ttl is set --- dbms/src/DataStreams/TTLBlockInputStream.cpp | 8 ++--- dbms/src/Storages/MergeTree/MergeTreeData.cpp | 30 +++++++++---------- dbms/src/Storages/MergeTree/MergeTreeData.h | 8 +++-- .../MergeTree/MergeTreeDataWriter.cpp | 4 +-- .../ReplicatedMergeTreeTableMetadata.cpp | 30 ------------------- .../ReplicatedMergeTreeTableMetadata.h | 6 +--- dbms/src/Storages/StorageMergeTree.cpp | 2 +- .../Storages/StorageReplicatedMergeTree.cpp | 2 +- .../01070_alter_with_ttl.reference | 0 .../0_stateless/01070_alter_with_ttl.sql | 12 ++++++++ 10 files changed, 42 insertions(+), 60 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01070_alter_with_ttl.reference create mode 100644 dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql diff --git a/dbms/src/DataStreams/TTLBlockInputStream.cpp b/dbms/src/DataStreams/TTLBlockInputStream.cpp index 339f81321e4..c08abba3bdf 100644 --- a/dbms/src/DataStreams/TTLBlockInputStream.cpp +++ b/dbms/src/DataStreams/TTLBlockInputStream.cpp @@ -70,7 +70,7 @@ bool TTLBlockInputStream::isTTLExpired(time_t ttl) Block TTLBlockInputStream::readImpl() { /// Skip all data if table ttl is expired for part - if (storage.hasTableTTL() && isTTLExpired(old_ttl_infos.table_ttl.max)) + if (storage.hasRowsTTL() && isTTLExpired(old_ttl_infos.table_ttl.max)) { rows_removed = data_part->rows_count; return {}; @@ -80,7 +80,7 @@ Block TTLBlockInputStream::readImpl() if (!block) return block; - if (storage.hasTableTTL() && (force || isTTLExpired(old_ttl_infos.table_ttl.min))) + if (storage.hasRowsTTL() && (force || isTTLExpired(old_ttl_infos.table_ttl.min))) removeRowsWithExpiredTableTTL(block); removeValuesWithExpiredColumnTTL(block); @@ -106,10 
+106,10 @@ void TTLBlockInputStream::readSuffixImpl() void TTLBlockInputStream::removeRowsWithExpiredTableTTL(Block & block) { - storage.ttl_table_entry.expression->execute(block); + storage.rows_ttl_entry.expression->execute(block); const IColumn * ttl_column = - block.getByName(storage.ttl_table_entry.result_column).column.get(); + block.getByName(storage.rows_ttl_entry.result_column).column.get(); const auto & column_names = header.getNames(); MutableColumns result_columns; diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index 210d412d86f..d1c9e274e38 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -101,6 +101,7 @@ namespace ErrorCodes extern const int UNKNOWN_SETTING; extern const int READONLY_SETTING; extern const int ABORTED; + extern const int UNEXPECTED_AST_STRUCTURE; } @@ -626,7 +627,7 @@ void MergeTreeData::setTTLExpressions(const ColumnsDescription::ColumnTTLs & new { auto new_ttl_entry = create_ttl_entry(ast); if (!only_check) - column_ttl_entries_by_name.emplace(name, new_ttl_entry); + column_ttl_entries_by_name[name] = new_ttl_entry; } } } @@ -634,36 +635,35 @@ void MergeTreeData::setTTLExpressions(const ColumnsDescription::ColumnTTLs & new if (new_ttl_table_ast) { std::vector update_move_ttl_entries; - ASTPtr update_ttl_table_ast = nullptr; - TTLEntry update_ttl_table_entry; + TTLEntry update_rows_ttl_entry; bool seen_delete_ttl = false; for (auto ttl_element_ptr : new_ttl_table_ast->children) { - ASTTTLElement & ttl_element = static_cast(*ttl_element_ptr); - if (ttl_element.destination_type == PartDestinationType::DELETE) + const auto * ttl_element = ttl_element_ptr->as(); + if (!ttl_element) + throw Exception("Unexpected AST element in TTL expression", ErrorCodes::UNEXPECTED_AST_STRUCTURE); + + if (ttl_element->destination_type == PartDestinationType::DELETE) { if (seen_delete_ttl) { throw Exception("More than one DELETE TTL 
expression is not allowed", ErrorCodes::BAD_TTL_EXPRESSION); } - auto new_ttl_table_entry = create_ttl_entry(ttl_element.children[0]); + auto new_rows_ttl_entry = create_ttl_entry(ttl_element->children[0]); if (!only_check) - { - update_ttl_table_ast = ttl_element.children[0]; - update_ttl_table_entry = new_ttl_table_entry; - } + update_rows_ttl_entry = new_rows_ttl_entry; seen_delete_ttl = true; } else { - auto new_ttl_entry = create_ttl_entry(ttl_element.children[0]); + auto new_ttl_entry = create_ttl_entry(ttl_element->children[0]); new_ttl_entry.entry_ast = ttl_element_ptr; - new_ttl_entry.destination_type = ttl_element.destination_type; - new_ttl_entry.destination_name = ttl_element.destination_name; + new_ttl_entry.destination_type = ttl_element->destination_type; + new_ttl_entry.destination_name = ttl_element->destination_name; if (!new_ttl_entry.getDestination(getStoragePolicy())) { String message; @@ -681,8 +681,8 @@ void MergeTreeData::setTTLExpressions(const ColumnsDescription::ColumnTTLs & new if (!only_check) { - ttl_table_entry = update_ttl_table_entry; - ttl_table_ast = update_ttl_table_ast; + rows_ttl_entry = update_rows_ttl_entry; + ttl_table_ast = new_ttl_table_ast; auto move_ttl_entries_lock = std::lock_guard(move_ttl_entries_mutex); move_ttl_entries = update_move_ttl_entries; diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.h b/dbms/src/Storages/MergeTree/MergeTreeData.h index 3c051829a61..ba2af73a421 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.h +++ b/dbms/src/Storages/MergeTree/MergeTreeData.h @@ -576,8 +576,10 @@ public: bool hasSortingKey() const { return !sorting_key_columns.empty(); } bool hasPrimaryKey() const { return !primary_key_columns.empty(); } bool hasSkipIndices() const { return !skip_indices.empty(); } - bool hasTableTTL() const { return ttl_table_ast != nullptr; } + bool hasAnyColumnTTL() const { return !column_ttl_entries_by_name.empty(); } + bool hasAnyMoveTTL() const { return !move_ttl_entries.empty(); } + 
bool hasRowsTTL() const { return rows_ttl_entry.isEmpty(); } /// Check that the part is not broken and calculate the checksums for it if they are not present. MutableDataPartPtr loadPartAndFixMetadata(const DiskPtr & disk, const String & relative_path); @@ -735,6 +737,8 @@ public: /// Checks if given part already belongs destination disk or volume for this rule. bool isPartInDestination(const StoragePolicyPtr & policy, const MergeTreeDataPart & part) const; + + bool isEmpty() const { return expression != nullptr; } }; std::optional selectTTLEntryForTTLInfos(const MergeTreeDataPart::TTLInfos & ttl_infos, time_t time_of_move) const; @@ -742,7 +746,7 @@ public: using TTLEntriesByName = std::unordered_map; TTLEntriesByName column_ttl_entries_by_name; - TTLEntry ttl_table_entry; + TTLEntry rows_ttl_entry; /// This mutex is required for background move operations which do not obtain global locks. mutable std::mutex move_ttl_entries_mutex; diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 1c8e2e5621a..549345de8d1 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -278,8 +278,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(BlockWithPa ProfileEvents::increment(ProfileEvents::MergeTreeDataWriterBlocksAlreadySorted); } - if (data.hasTableTTL()) - updateTTL(data.ttl_table_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.table_ttl, block, true); + if (data.hasRowsTTL()) + updateTTL(data.rows_ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.table_ttl, block, true); for (const auto & [name, ttl_entry] : data.column_ttl_entries_by_name) updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.columns_ttl[name], block, true); diff --git a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.cpp b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.cpp 
index 703659bb4ea..4ebb51f0b41 100644 --- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.cpp +++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.cpp @@ -55,15 +55,6 @@ ReplicatedMergeTreeTableMetadata::ReplicatedMergeTreeTableMetadata(const MergeTr ttl_table = formattedAST(data.ttl_table_ast); - std::ostringstream ttl_move_stream; - for (const auto & ttl_entry : data.move_ttl_entries) - { - if (ttl_move_stream.tellp() > 0) - ttl_move_stream << ", "; - ttl_move_stream << formattedAST(ttl_entry.entry_ast); - } - ttl_move = ttl_move_stream.str(); - skip_indices = data.getIndices().toString(); if (data.canUseAdaptiveGranularity()) index_granularity_bytes = data_settings->index_granularity_bytes; @@ -95,9 +86,6 @@ void ReplicatedMergeTreeTableMetadata::write(WriteBuffer & out) const if (!ttl_table.empty()) out << "ttl: " << ttl_table << "\n"; - if (!ttl_move.empty()) - out << "move ttl: " << ttl_move << "\n"; - if (!skip_indices.empty()) out << "indices: " << skip_indices << "\n"; @@ -139,9 +127,6 @@ void ReplicatedMergeTreeTableMetadata::read(ReadBuffer & in) if (checkString("ttl: ", in)) in >> ttl_table >> "\n"; - if (checkString("move ttl: ", in)) - in >> ttl_move >> "\n"; - if (checkString("indices: ", in)) in >> skip_indices >> "\n"; @@ -252,21 +237,6 @@ ReplicatedMergeTreeTableMetadata::checkAndFindDiff(const ReplicatedMergeTreeTabl ErrorCodes::METADATA_MISMATCH); } - if (ttl_move != from_zk.ttl_move) - { - if (allow_alter) - { - diff.ttl_move_changed = true; - diff.new_ttl_move = from_zk.ttl_move; - } - else - throw Exception( - "Existing table metadata in ZooKeeper differs in move TTL." 
- " Stored in ZooKeeper: " + from_zk.ttl_move + - ", local: " + ttl_move, - ErrorCodes::METADATA_MISMATCH); - } - if (skip_indices != from_zk.skip_indices) { if (allow_alter) diff --git a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h index 23fc4f6a024..d8af3c2087a 100644 --- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h +++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h @@ -28,7 +28,6 @@ struct ReplicatedMergeTreeTableMetadata String skip_indices; String constraints; String ttl_table; - String ttl_move; UInt64 index_granularity_bytes; ReplicatedMergeTreeTableMetadata() = default; @@ -54,12 +53,9 @@ struct ReplicatedMergeTreeTableMetadata bool ttl_table_changed = false; String new_ttl_table; - bool ttl_move_changed = false; - String new_ttl_move; - bool empty() const { - return !sorting_key_changed && !skip_indices_changed && !ttl_table_changed && !constraints_changed && !ttl_move_changed; + return !sorting_key_changed && !skip_indices_changed && !ttl_table_changed && !constraints_changed; } }; diff --git a/dbms/src/Storages/StorageMergeTree.cpp b/dbms/src/Storages/StorageMergeTree.cpp index f5279ebef92..d8b25627a7e 100644 --- a/dbms/src/Storages/StorageMergeTree.cpp +++ b/dbms/src/Storages/StorageMergeTree.cpp @@ -652,7 +652,7 @@ bool StorageMergeTree::merge( { /// Force filter by TTL in 'OPTIMIZE ... FINAL' query to remove expired values from old parts /// without TTL infos or with outdated TTL infos, e.g. after 'ALTER ... MODIFY TTL' query. 
- bool force_ttl = (final && (hasTableTTL() || hasAnyColumnTTL())); + bool force_ttl = (final && (hasRowsTTL() || hasAnyColumnTTL())); new_part = merger_mutator.mergePartsToTemporaryPart( future_part, *merge_entry, table_lock_holder, time(nullptr), diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 2fdd7daa684..db113624f68 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -3132,7 +3132,7 @@ bool StorageReplicatedMergeTree::optimize(const ASTPtr & query, const ASTPtr & p return false; }; - bool force_ttl = (final && (hasTableTTL() || hasAnyColumnTTL())); + bool force_ttl = (final && (hasRowsTTL() || hasAnyColumnTTL())); const auto storage_settings_ptr = getSettings(); if (!partition && final) diff --git a/dbms/tests/queries/0_stateless/01070_alter_with_ttl.reference b/dbms/tests/queries/0_stateless/01070_alter_with_ttl.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql b/dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql new file mode 100644 index 00000000000..4c1828f3f9b --- /dev/null +++ b/dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql @@ -0,0 +1,12 @@ +drop table if exists alter_ttl; + +create table alter_ttl(i Int) engine = MergeTree order by i ttl toDate('2020-05-05'); +alter table alter_ttl add column s String; +alter table alter_ttl modify column s String ttl toDate('2020-01-01'); +show create table alter_ttl; +drop table alter_ttl; + +create table alter_ttl(d Date, s String) engine = MergeTree order by i ttl d + interval 1 month; +alter table alter_ttl modify column s String ttl d + interval 1 day; +show create table alter_ttl; +drop table alter_ttl; From 1764ba5219f10327752bcc9aef451f6e74acf1aa Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Thu, 23 Jan 2020 16:11:06 +0300 Subject: [PATCH 68/89] test for alter with ttl --- 
dbms/tests/queries/0_stateless/01070_alter_with_ttl.reference | 2 ++ dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dbms/tests/queries/0_stateless/01070_alter_with_ttl.reference b/dbms/tests/queries/0_stateless/01070_alter_with_ttl.reference index e69de29bb2d..308d004ebf7 100644 --- a/dbms/tests/queries/0_stateless/01070_alter_with_ttl.reference +++ b/dbms/tests/queries/0_stateless/01070_alter_with_ttl.reference @@ -0,0 +1,2 @@ +CREATE TABLE default.alter_ttl (`i` Int, `s` String TTL toDate(\'2020-01-01\')) ENGINE = MergeTree ORDER BY i TTL toDate(\'2020-05-05\') SETTINGS index_granularity = 8192 +CREATE TABLE default.alter_ttl (`d` Date, `s` String TTL d + toIntervalDay(1)) ENGINE = MergeTree ORDER BY d TTL d + toIntervalMonth(1) SETTINGS index_granularity = 8192 diff --git a/dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql b/dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql index 4c1828f3f9b..3adc3ccd6ae 100644 --- a/dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql +++ b/dbms/tests/queries/0_stateless/01070_alter_with_ttl.sql @@ -6,7 +6,7 @@ alter table alter_ttl modify column s String ttl toDate('2020-01-01'); show create table alter_ttl; drop table alter_ttl; -create table alter_ttl(d Date, s String) engine = MergeTree order by i ttl d + interval 1 month; +create table alter_ttl(d Date, s String) engine = MergeTree order by d ttl d + interval 1 month; alter table alter_ttl modify column s String ttl d + interval 1 day; show create table alter_ttl; drop table alter_ttl; From 6a6ec36d635ec49d156f37e9c91d39620c6dc02f Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 23 Jan 2020 16:35:12 +0300 Subject: [PATCH 69/89] Fix race when executing SYSTEM RELOAD ALL DICTIONARIES. 
--- dbms/src/Interpreters/ExternalLoader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Interpreters/ExternalLoader.cpp b/dbms/src/Interpreters/ExternalLoader.cpp index e9cfe602437..a96f64c110f 100644 --- a/dbms/src/Interpreters/ExternalLoader.cpp +++ b/dbms/src/Interpreters/ExternalLoader.cpp @@ -540,6 +540,7 @@ public: Strings getAllTriedToLoadNames() const { + std::lock_guard lock{mutex}; Strings names; for (auto & [name, info] : infos) if (info.triedToLoad()) From 97c67e0e0a2e2c9408104172f322a48c274db4e8 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 23 Jan 2020 17:33:48 +0300 Subject: [PATCH 70/89] Update AUTHORS --- AUTHORS | 44 +------------------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/AUTHORS b/AUTHORS index db769ac16ce..9b046d2f864 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,43 +1 @@ -The following authors have created the source code of "ClickHouse" -published and distributed by YANDEX LLC as the owner: - -Alexander Makarov -Alexander Prudaev -Alexey Arno -Alexey Milovidov -Alexey Tronov -Alexey Vasiliev -Alexey Zatelepin -Amy Krishnevsky -Andrey M -Andrey Mironov -Andrey Urusov -Anton Tikhonov -Dmitry Bilunov -Dmitry Galuza -Eugene Konkov -Evgeniy Gatov -Ilya Khomutov -Ilya Korolev -Ivan Blinkov -Maxim Nikulin -Michael Kolupaev -Michael Razuvaev -Nikolai Kochetov -Nikolay Vasiliev -Nikolay Volosatov -Pavel Artemkin -Pavel Kartaviy -Roman Nozdrin -Roman Peshkurov -Sergey Fedorov -Sergey Lazarev -Sergey Magidovich -Sergey Serebryanik -Sergey Veletskiy -Vasily Okunev -Vitaliy Lyudvichenko -Vladimir Chebotarev -Vsevolod Orlov -Vyacheslav Alipov -Yuriy Galitskiy +To see the list of authors who created the source code of ClickHouse, published and distributed by YANDEX LLC as the owner, run "SELECT * FROM system.contributors;" query on any ClickHouse server. 
From c6e7c29eeff14d94695ac133113d59e9ded159a4 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 23 Jan 2020 17:36:05 +0300 Subject: [PATCH 71/89] Update AUTHORS --- AUTHORS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index 9b046d2f864..12838d7fa14 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1 +1,2 @@ -To see the list of authors who created the source code of ClickHouse, published and distributed by YANDEX LLC as the owner, run "SELECT * FROM system.contributors;" query on any ClickHouse server. +To see the list of authors who created the source code of ClickHouse, published and distributed by YANDEX LLC as the owner, +run "SELECT * FROM system.contributors;" query on any ClickHouse server. From 9106fc74b4b4a048795d611baae57ecbaef9d8c8 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 23 Jan 2020 17:38:30 +0300 Subject: [PATCH 72/89] Update LICENSE --- LICENSE | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index ef36c40c4b0..f79538892b8 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2016-2019 Yandex LLC +Copyright 2016-2020 Yandex LLC Apache License Version 2.0, January 2004 @@ -188,7 +188,7 @@ Copyright 2016-2019 Yandex LLC same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2016-2019 Yandex LLC + Copyright 2016-2020 Yandex LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
From 07fc58771d2b06cde38f4489a43bb14cf784c318 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 23 Jan 2020 17:41:05 +0300 Subject: [PATCH 73/89] Update index.html --- website/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/index.html b/website/index.html index ec686bceefb..fa9abdda140 100644 --- a/website/index.html +++ b/website/index.html @@ -501,7 +501,7 @@ sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh ClickHouse source code is published under Apache 2.0 License. Software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- + From 405f9f0696a0ce650dbfa7328c39ba4d44cc0575 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 Jan 2020 17:58:50 +0300 Subject: [PATCH 74/89] Fix LazyOutput. --- dbms/src/Processors/Formats/LazyOutputFormat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Processors/Formats/LazyOutputFormat.h b/dbms/src/Processors/Formats/LazyOutputFormat.h index a3bc76e839f..441a3449620 100644 --- a/dbms/src/Processors/Formats/LazyOutputFormat.h +++ b/dbms/src/Processors/Formats/LazyOutputFormat.h @@ -20,7 +20,7 @@ public: Block getTotals(); Block getExtremes(); - bool isFinished() { return finished_processing; } + bool isFinished() { return finished_processing && queue.size() == 0; } BlockStreamProfileInfo & getProfileInfo() { return info; } From 88bfb788a94eeb4756a30b18cb3811ce67fb1e37 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 19 Jan 2020 17:26:28 +0300 Subject: [PATCH 75/89] Add ability to use multiple disks/volumes for temporary data This patch adds config directive, that will define the policy to use for storing temporary files, if it is not set (default) the will be used. 
Also tmp_policy has some limitations: - move_factor is ignored - keep_free_space_bytes is ignored - max_data_part_size_bytes is ignored - must have exactly one volume --- dbms/programs/local/LocalServer.cpp | 2 +- dbms/programs/server/HTTPHandler.cpp | 4 +- dbms/programs/server/Server.cpp | 45 +++++++++++------ dbms/programs/server/config.xml | 11 +++++ .../MergeSortingBlockInputStream.cpp | 13 +++-- .../MergeSortingBlockInputStream.h | 7 ++- dbms/src/Disks/DiskSpaceMonitor.cpp | 6 +++ dbms/src/Disks/DiskSpaceMonitor.h | 7 +++ dbms/src/Interpreters/Aggregator.cpp | 18 +++++-- dbms/src/Interpreters/Aggregator.h | 11 +++-- dbms/src/Interpreters/AnalyzedJoin.cpp | 4 +- dbms/src/Interpreters/AnalyzedJoin.h | 9 ++-- dbms/src/Interpreters/Context.cpp | 49 ++++++++++++++----- dbms/src/Interpreters/Context.h | 9 +++- .../Interpreters/InterpreterSelectQuery.cpp | 13 ++--- dbms/src/Interpreters/MergeJoin.cpp | 6 ++- dbms/src/Interpreters/MergeJoin.h | 9 ++-- dbms/src/Interpreters/SyntaxAnalyzer.cpp | 2 +- dbms/src/Interpreters/tests/aggregate.cpp | 2 +- .../Transforms/MergeSortingTransform.cpp | 13 +++-- .../Transforms/MergeSortingTransform.h | 7 ++- .../tests/processors_test_aggregation.cpp | 8 ++- ...rocessors_test_merge_sorting_transform.cpp | 11 ++++- 23 files changed, 194 insertions(+), 72 deletions(-) diff --git a/dbms/programs/local/LocalServer.cpp b/dbms/programs/local/LocalServer.cpp index cac561117b4..5cfceaeb592 100644 --- a/dbms/programs/local/LocalServer.cpp +++ b/dbms/programs/local/LocalServer.cpp @@ -111,7 +111,7 @@ void LocalServer::tryInitPath() /// In case of empty path set paths to helpful directories std::string cd = Poco::Path::current(); - context->setTemporaryPath(cd + "tmp"); + context->setTemporaryStorage(cd + "tmp"); context->setFlagsPath(cd + "flags"); context->setUserFilesPath(""); // user's files are everywhere } diff --git a/dbms/programs/server/HTTPHandler.cpp b/dbms/programs/server/HTTPHandler.cpp index b2b3298693e..4df7f9f263f 100644 --- 
a/dbms/programs/server/HTTPHandler.cpp +++ b/dbms/programs/server/HTTPHandler.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -351,7 +352,8 @@ void HTTPHandler::processQuery( if (buffer_until_eof) { - std::string tmp_path_template = context.getTemporaryPath() + "http_buffers/"; + const std::string tmp_path(context.getTemporaryVolume()->getNextDisk()->getPath()); + const std::string tmp_path_template(tmp_path + "http_buffers/"); auto create_tmp_disk_buffer = [tmp_path_template] (const WriteBufferPtr &) { diff --git a/dbms/programs/server/Server.cpp b/dbms/programs/server/Server.cpp index 3ff943d519e..a6b361f90c2 100644 --- a/dbms/programs/server/Server.cpp +++ b/dbms/programs/server/Server.cpp @@ -77,6 +77,31 @@ namespace CurrentMetrics extern const Metric VersionInteger; } +namespace +{ + +void setupTmpPath(Logger * log, const std::string & path) +{ + LOG_DEBUG(log, "Setting up " << path << " to store temporary data in it"); + + Poco::File(path).createDirectories(); + + /// Clearing old temporary files. + Poco::DirectoryIterator dir_end; + for (Poco::DirectoryIterator it(path); it != dir_end; ++it) + { + if (it->isFile() && startsWith(it.name(), "tmp")) + { + LOG_DEBUG(log, "Removing old temporary file " << it->path()); + it->remove(); + } + else + LOG_DEBUG(log, "Skipped file in temporary path " << it->path()); + } +} + +} + namespace DB { @@ -331,22 +356,14 @@ int Server::main(const std::vector & /*args*/) DateLUT::instance(); LOG_TRACE(log, "Initialized DateLUT with time zone '" << DateLUT::instance().getTimeZone() << "'."); - /// Directory with temporary data for processing of heavy queries. + + /// Storage with temporary data for processing of heavy queries. { std::string tmp_path = config().getString("tmp_path", path + "tmp/"); - global_context->setTemporaryPath(tmp_path); - Poco::File(tmp_path).createDirectories(); - - /// Clearing old temporary files. 
- Poco::DirectoryIterator dir_end; - for (Poco::DirectoryIterator it(tmp_path); it != dir_end; ++it) - { - if (it->isFile() && startsWith(it.name(), "tmp")) - { - LOG_DEBUG(log, "Removing old temporary file " << it->path()); - it->remove(); - } - } + std::string tmp_policy = config().getString("tmp_policy", ""); + const VolumePtr & volume = global_context->setTemporaryStorage(tmp_path, tmp_policy); + for (const DiskPtr & disk : volume->disks) + setupTmpPath(log, disk->getPath()); } /** Directory with 'flags': files indicating temporary settings for the server set by system administrator. diff --git a/dbms/programs/server/config.xml b/dbms/programs/server/config.xml index 80759a5a53e..e8ce1c5b688 100644 --- a/dbms/programs/server/config.xml +++ b/dbms/programs/server/config.xml @@ -133,6 +133,17 @@ /var/lib/clickhouse/tmp/ + + + /var/lib/clickhouse/user_files/ diff --git a/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp b/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp index 52f85f1349c..21422d0fe54 100644 --- a/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp +++ b/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace ProfileEvents @@ -21,10 +22,10 @@ namespace DB MergeSortingBlockInputStream::MergeSortingBlockInputStream( const BlockInputStreamPtr & input, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_, size_t max_bytes_before_remerge_, - size_t max_bytes_before_external_sort_, const std::string & tmp_path_, size_t min_free_disk_space_) + size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_, size_t min_free_disk_space_) : description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_), max_bytes_before_remerge(max_bytes_before_remerge_), - max_bytes_before_external_sort(max_bytes_before_external_sort_), tmp_path(tmp_path_), + max_bytes_before_external_sort(max_bytes_before_external_sort_), tmp_volume(tmp_volume_), 
min_free_disk_space(min_free_disk_space_) { children.push_back(input); @@ -78,10 +79,14 @@ Block MergeSortingBlockInputStream::readImpl() */ if (max_bytes_before_external_sort && sum_bytes_in_blocks > max_bytes_before_external_sort) { - if (!enoughSpaceInDirectory(tmp_path, sum_bytes_in_blocks + min_free_disk_space)) - throw Exception("Not enough space for external sort in " + tmp_path, ErrorCodes::NOT_ENOUGH_SPACE); + size_t size = sum_bytes_in_blocks + min_free_disk_space; + auto reservation = tmp_volume->reserve(size); + if (!reservation) + throw Exception("Not enough space for external sort in temporary storage", ErrorCodes::NOT_ENOUGH_SPACE); + const std::string tmp_path(reservation->getDisk()->getPath()); temporary_files.emplace_back(createTemporaryFile(tmp_path)); + const std::string & path = temporary_files.back()->path(); MergeSortingBlocksBlockInputStream block_in(blocks, description, max_merged_block_size, limit); diff --git a/dbms/src/DataStreams/MergeSortingBlockInputStream.h b/dbms/src/DataStreams/MergeSortingBlockInputStream.h index ce82f6bb120..5b157310765 100644 --- a/dbms/src/DataStreams/MergeSortingBlockInputStream.h +++ b/dbms/src/DataStreams/MergeSortingBlockInputStream.h @@ -18,6 +18,9 @@ namespace DB struct TemporaryFileStream; +class Volume; +using VolumePtr = std::shared_ptr; + namespace ErrorCodes { extern const int NOT_ENOUGH_SPACE; @@ -77,7 +80,7 @@ public: MergeSortingBlockInputStream(const BlockInputStreamPtr & input, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_, size_t max_bytes_before_remerge_, - size_t max_bytes_before_external_sort_, const std::string & tmp_path_, + size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_, size_t min_free_disk_space_); String getName() const override { return "MergeSorting"; } @@ -97,7 +100,7 @@ private: size_t max_bytes_before_remerge; size_t max_bytes_before_external_sort; - const std::string tmp_path; + VolumePtr tmp_volume; size_t min_free_disk_space; Logger 
* log = &Logger::get("MergeSortingBlockInputStream"); diff --git a/dbms/src/Disks/DiskSpaceMonitor.cpp b/dbms/src/Disks/DiskSpaceMonitor.cpp index 59b8c21119a..6cc6d7e04db 100644 --- a/dbms/src/Disks/DiskSpaceMonitor.cpp +++ b/dbms/src/Disks/DiskSpaceMonitor.cpp @@ -111,6 +111,12 @@ Volume::Volume( << " < " << formatReadableSizeWithBinarySuffix(MIN_PART_SIZE) << ")"); } +DiskPtr Volume::getNextDisk() +{ + size_t start_from = last_used.fetch_add(1u, std::memory_order_relaxed); + size_t index = start_from % disks.size(); + return disks[index]; +} ReservationPtr Volume::reserve(UInt64 expected_size) { diff --git a/dbms/src/Disks/DiskSpaceMonitor.h b/dbms/src/Disks/DiskSpaceMonitor.h index 3d2216b545b..cb00944e149 100644 --- a/dbms/src/Disks/DiskSpaceMonitor.h +++ b/dbms/src/Disks/DiskSpaceMonitor.h @@ -67,6 +67,13 @@ public: const String & config_prefix, const DiskSelector & disk_selector); + /// Next disk (round-robin) + /// + /// - Used with policy for temporary data + /// - Ignores all limitations + /// - Shares last access with reserve() + DiskPtr getNextDisk(); + /// Uses Round-robin to choose disk for reservation. /// Returns valid reservation or nullptr if there is no space left on any disk. 
ReservationPtr reserve(UInt64 bytes) override; diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 39b5175722c..8118f2d1c54 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace ProfileEvents @@ -681,22 +682,25 @@ bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedData && current_memory_usage > static_cast(params.max_bytes_before_external_group_by) && worth_convert_to_two_level) { - if (!enoughSpaceInDirectory(params.tmp_path, current_memory_usage + params.min_free_disk_space)) - throw Exception("Not enough space for external aggregation in " + params.tmp_path, ErrorCodes::NOT_ENOUGH_SPACE); + size_t size = current_memory_usage + params.min_free_disk_space; + auto reservation = params.tmp_volume->reserve(size); + if (!reservation) + throw Exception("Not enough space for external aggregation in temporary storage", ErrorCodes::NOT_ENOUGH_SPACE); - writeToTemporaryFile(result); + const std::string tmp_path(reservation->getDisk()->getPath()); + writeToTemporaryFile(result, tmp_path); } return true; } -void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants) +void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants, const String & tmp_path) { Stopwatch watch; size_t rows = data_variants.size(); - auto file = createTemporaryFile(params.tmp_path); + auto file = createTemporaryFile(tmp_path); const std::string & path = file->path(); WriteBufferFromFile file_buf(path); CompressedWriteBuffer compressed_buf(file_buf); @@ -753,6 +757,10 @@ void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants) << (uncompressed_bytes / elapsed_seconds / 1048576.0) << " MiB/sec. uncompressed, " << (compressed_bytes / elapsed_seconds / 1048576.0) << " MiB/sec. 
compressed)"); } +void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants) +{ + return writeToTemporaryFile(data_variants, params.tmp_volume->getNextDisk()->getPath()); +} template diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index ce2872714cb..cdb1b96f4e8 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -46,6 +46,8 @@ namespace ErrorCodes class IBlockOutputStream; +class Volume; +using VolumePtr = std::shared_ptr; /** Different data structures that can be used for aggregation * For efficiency, the aggregation data itself is put into the pool. @@ -860,7 +862,7 @@ public: /// Return empty result when aggregating without keys on empty set. bool empty_result_for_aggregation_by_empty_set; - const std::string tmp_path; + VolumePtr tmp_volume; /// Settings is used to determine cache size. No threads are created. size_t max_threads; @@ -873,7 +875,7 @@ public: size_t group_by_two_level_threshold_, size_t group_by_two_level_threshold_bytes_, size_t max_bytes_before_external_group_by_, bool empty_result_for_aggregation_by_empty_set_, - const std::string & tmp_path_, size_t max_threads_, + VolumePtr tmp_volume_, size_t max_threads_, size_t min_free_disk_space_) : src_header(src_header_), keys(keys_), aggregates(aggregates_), keys_size(keys.size()), aggregates_size(aggregates.size()), @@ -881,7 +883,7 @@ public: group_by_two_level_threshold(group_by_two_level_threshold_), group_by_two_level_threshold_bytes(group_by_two_level_threshold_bytes_), max_bytes_before_external_group_by(max_bytes_before_external_group_by_), empty_result_for_aggregation_by_empty_set(empty_result_for_aggregation_by_empty_set_), - tmp_path(tmp_path_), max_threads(max_threads_), + tmp_volume(tmp_volume_), max_threads(max_threads_), min_free_disk_space(min_free_disk_space_) { } @@ -889,7 +891,7 @@ public: /// Only parameters that matter during merge. 
Params(const Block & intermediate_header_, const ColumnNumbers & keys_, const AggregateDescriptions & aggregates_, bool overflow_row_, size_t max_threads_) - : Params(Block(), keys_, aggregates_, overflow_row_, 0, OverflowMode::THROW, 0, 0, 0, false, "", max_threads_, 0) + : Params(Block(), keys_, aggregates_, overflow_row_, 0, OverflowMode::THROW, 0, 0, 0, false, nullptr, max_threads_, 0) { intermediate_header = intermediate_header_; } @@ -955,6 +957,7 @@ public: void setCancellationHook(const CancellationHook cancellation_hook); /// For external aggregation. + void writeToTemporaryFile(AggregatedDataVariants & data_variants, const String & tmp_path); void writeToTemporaryFile(AggregatedDataVariants & data_variants); bool hasTemporaryFiles() const { return !temporary_files.empty(); } diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index 5e4bf1fe53b..74bd227bedd 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -19,14 +19,14 @@ namespace ErrorCodes extern const int PARAMETER_OUT_OF_BOUND; } -AnalyzedJoin::AnalyzedJoin(const Settings & settings, const String & tmp_path_) +AnalyzedJoin::AnalyzedJoin(const Settings & settings, VolumePtr tmp_volume_) : size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode}) , default_max_bytes(settings.default_max_bytes_in_join) , join_use_nulls(settings.join_use_nulls) , partial_merge_join(settings.partial_merge_join) , partial_merge_join_optimizations(settings.partial_merge_join_optimizations) , partial_merge_join_rows_in_right_blocks(settings.partial_merge_join_rows_in_right_blocks) - , tmp_path(tmp_path_) + , tmp_volume(tmp_volume_) {} void AnalyzedJoin::addUsingKey(const ASTPtr & ast) diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index 677662d949c..c521c3233d3 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ 
b/dbms/src/Interpreters/AnalyzedJoin.h @@ -21,6 +21,9 @@ class Block; struct Settings; +class Volume; +using VolumePtr = std::shared_ptr; + class AnalyzedJoin { /** Query of the form `SELECT expr(x) AS k FROM t1 ANY LEFT JOIN (SELECT expr(x) AS k FROM t2) USING k` @@ -61,10 +64,10 @@ class AnalyzedJoin /// Original name -> name. Only ranamed columns. std::unordered_map renames; - String tmp_path; + VolumePtr tmp_volume; public: - AnalyzedJoin(const Settings &, const String & tmp_path); + AnalyzedJoin(const Settings &, VolumePtr tmp_volume); /// for StorageJoin AnalyzedJoin(SizeLimits limits, bool use_nulls, ASTTableJoin::Kind kind, ASTTableJoin::Strictness strictness, @@ -81,7 +84,7 @@ public: ASTTableJoin::Kind kind() const { return table_join.kind; } ASTTableJoin::Strictness strictness() const { return table_join.strictness; } const SizeLimits & sizeLimits() const { return size_limits; } - const String & getTemporaryPath() const { return tmp_path; } + VolumePtr getTemporaryVolume() { return tmp_volume; } bool forceNullableRight() const { return join_use_nulls && isLeftOrFull(table_join.kind); } bool forceNullableLeft() const { return join_use_nulls && isRightOrFull(table_join.kind); } diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index 66ce18aa2c4..04d01a24cc1 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -95,6 +96,7 @@ namespace ErrorCodes extern const int SCALAR_ALREADY_EXISTS; extern const int UNKNOWN_SCALAR; extern const int NOT_ENOUGH_PRIVILEGES; + extern const int UNKNOWN_POLICY; } @@ -123,12 +125,14 @@ struct ContextShared String interserver_scheme; /// http or https String path; /// Path to the data directory, with a slash at the end. - String tmp_path; /// The path to the temporary files that occur when processing the request. 
String flags_path; /// Path to the directory with some control flags for server maintenance. String user_files_path; /// Path to the directory with user provided files, usable by 'file' table function. String dictionaries_lib_path; /// Path to the directory with user provided binaries and libraries for external dictionaries. ConfigurationPtr config; /// Global configuration settings. + String tmp_path; /// Path to the temporary files that occur when processing the request. + mutable VolumePtr tmp_volume; /// Volume for the the temporary files that occur when processing the request. + Databases databases; /// List of databases and tables in them. mutable std::optional embedded_dictionaries; /// Metrica's dictionaries. Have lazy initialization. mutable std::optional external_dictionaries_loader; @@ -151,9 +155,9 @@ struct ContextShared std::unique_ptr ddl_worker; /// Process ddl commands from zk. /// Rules for selecting the compression settings, depending on the size of the part. mutable std::unique_ptr compression_codec_selector; - /// Storage disk chooser + /// Storage disk chooser for MergeTree engines mutable std::unique_ptr merge_tree_disk_selector; - /// Storage policy chooser + /// Storage policy chooser for MergeTree engines mutable std::unique_ptr merge_tree_storage_policy_selector; std::optional merge_tree_settings; /// Settings of MergeTree* engines. 
@@ -527,12 +531,6 @@ String Context::getPath() const return shared->path; } -String Context::getTemporaryPath() const -{ - auto lock = getLock(); - return shared->tmp_path; -} - String Context::getFlagsPath() const { auto lock = getLock(); @@ -551,13 +549,19 @@ String Context::getDictionariesLibPath() const return shared->dictionaries_lib_path; } +VolumePtr Context::getTemporaryVolume() const +{ + auto lock = getLock(); + return shared->tmp_volume; +} + void Context::setPath(const String & path) { auto lock = getLock(); shared->path = path; - if (shared->tmp_path.empty()) + if (shared->tmp_path.empty() && !shared->tmp_volume) shared->tmp_path = shared->path + "tmp/"; if (shared->flags_path.empty()) @@ -570,10 +574,31 @@ void Context::setPath(const String & path) shared->dictionaries_lib_path = shared->path + "dictionaries_lib/"; } -void Context::setTemporaryPath(const String & path) +VolumePtr Context::setTemporaryStorage(const String & path, const String & policy_name) { auto lock = getLock(); - shared->tmp_path = path; + + if (policy_name.empty()) + { + shared->tmp_path = path; + if (!shared->tmp_path.ends_with('/')) + shared->tmp_path += '/'; + + auto disk = std::make_shared("_tmp_default", shared->tmp_path, 0); + shared->tmp_volume = std::make_shared("_tmp_default", std::vector{disk}, 0); + } + else + { + StoragePolicyPtr tmp_policy = getStoragePolicySelector()[policy_name]; + if (tmp_policy->getVolumes().size() != 1) + throw Exception("Policy " + policy_name + " is used temporary files, such policy should have exactly one volume", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + shared->tmp_volume = tmp_policy->getVolume(0); + } + + if (!shared->tmp_volume->disks.size()) + throw Exception("No disks volume for temporary files", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + + return shared->tmp_volume; } void Context::setFlagsPath(const String & path) diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index 1af87b527ad..dcce1a4772f 100644 --- 
a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -91,6 +91,9 @@ class StoragePolicySelector; class IOutputFormat; using OutputFormatPtr = std::shared_ptr; +class Volume; +using VolumePtr = std::shared_ptr; + #if USE_EMBEDDED_COMPILER class CompiledExpressionCache; @@ -195,17 +198,19 @@ public: ~Context(); String getPath() const; - String getTemporaryPath() const; String getFlagsPath() const; String getUserFilesPath() const; String getDictionariesLibPath() const; + VolumePtr getTemporaryVolume() const; + void setPath(const String & path); - void setTemporaryPath(const String & path); void setFlagsPath(const String & path); void setUserFilesPath(const String & path); void setDictionariesLibPath(const String & path); + VolumePtr setTemporaryStorage(const String & path, const String & policy_name = ""); + using ConfigurationPtr = Poco::AutoPtr; /// Global application configuration settings. diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index 41f888f898b..6c32da6ff58 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -1873,7 +1873,7 @@ void InterpreterSelectQuery::executeAggregation(Pipeline & pipeline, const Expre allow_to_use_two_level_group_by ? settings.group_by_two_level_threshold : SettingUInt64(0), allow_to_use_two_level_group_by ? 
settings.group_by_two_level_threshold_bytes : SettingUInt64(0), settings.max_bytes_before_external_group_by, settings.empty_result_for_aggregation_by_empty_set, - context->getTemporaryPath(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); + context->getTemporaryVolume(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); /// If there are several sources, then we perform parallel aggregation if (pipeline.streams.size() > 1) @@ -1939,7 +1939,7 @@ void InterpreterSelectQuery::executeAggregation(QueryPipeline & pipeline, const allow_to_use_two_level_group_by ? settings.group_by_two_level_threshold : SettingUInt64(0), allow_to_use_two_level_group_by ? settings.group_by_two_level_threshold_bytes : SettingUInt64(0), settings.max_bytes_before_external_group_by, settings.empty_result_for_aggregation_by_empty_set, - context->getTemporaryPath(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); + context->getTemporaryVolume(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); auto transform_params = std::make_shared(params, final); @@ -2165,7 +2165,7 @@ void InterpreterSelectQuery::executeRollupOrCube(Pipeline & pipeline, Modificato false, settings.max_rows_to_group_by, settings.group_by_overflow_mode, SettingUInt64(0), SettingUInt64(0), settings.max_bytes_before_external_group_by, settings.empty_result_for_aggregation_by_empty_set, - context->getTemporaryPath(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); + context->getTemporaryVolume(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); if (modificator == Modificator::ROLLUP) pipeline.firstStream() = std::make_shared(pipeline.firstStream(), params); @@ -2194,7 +2194,7 @@ void InterpreterSelectQuery::executeRollupOrCube(QueryPipeline & pipeline, Modif false, settings.max_rows_to_group_by, settings.group_by_overflow_mode, SettingUInt64(0), SettingUInt64(0), 
settings.max_bytes_before_external_group_by, settings.empty_result_for_aggregation_by_empty_set, - context->getTemporaryPath(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); + context->getTemporaryVolume(), settings.max_threads, settings.min_free_disk_space_for_temporary_data); auto transform_params = std::make_shared(params, true); @@ -2278,7 +2278,7 @@ void InterpreterSelectQuery::executeOrder(Pipeline & pipeline, InputSortingInfoP sorting_stream, output_order_descr, settings.max_block_size, limit, settings.max_bytes_before_remerge_sort, settings.max_bytes_before_external_sort / pipeline.streams.size(), - context->getTemporaryPath(), settings.min_free_disk_space_for_temporary_data); + context->getTemporaryVolume(), settings.min_free_disk_space_for_temporary_data); stream = merging_stream; }); @@ -2360,7 +2360,8 @@ void InterpreterSelectQuery::executeOrder(QueryPipeline & pipeline, InputSorting return std::make_shared( header, output_order_descr, settings.max_block_size, limit, settings.max_bytes_before_remerge_sort / pipeline.getNumStreams(), - settings.max_bytes_before_external_sort, context->getTemporaryPath(), settings.min_free_disk_space_for_temporary_data); + settings.max_bytes_before_external_sort, context->getTemporaryVolume(), + settings.min_free_disk_space_for_temporary_data); }); /// If there are several streams, we merge them into one diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index f301de17bc5..6ed7de9c898 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace DB { @@ -386,6 +387,8 @@ void MiniLSM::insert(const BlocksList & blocks) if (blocks.empty()) return; + const std::string path(volume->getNextDisk()->getPath()); + SortedFiles sorted_blocks; if (blocks.size() > 1) { @@ -414,6 +417,7 @@ void MiniLSM::merge(std::function callback) BlockInputStreams inputs = 
makeSortedInputStreams(sorted_files, sample_block); MergingSortedBlockInputStream sorted_stream(inputs, sort_description, rows_in_block); + const std::string path(volume->getNextDisk()->getPath()); SortedFiles out; flushStreamToFiles(path, sample_block, sorted_stream, out, callback); @@ -463,7 +467,7 @@ MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & ri makeSortAndMerge(table_join->keyNamesLeft(), left_sort_description, left_merge_description); makeSortAndMerge(table_join->keyNamesRight(), right_sort_description, right_merge_description); - lsm = std::make_unique(table_join->getTemporaryPath(), right_sample_block, right_sort_description, max_rows_in_right_block); + lsm = std::make_unique(table_join->getTemporaryVolume(), right_sample_block, right_sort_description, max_rows_in_right_block); } void MergeJoin::setTotals(const Block & totals_block) diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 960ca31153d..83fbe9ffc03 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -17,20 +17,23 @@ class AnalyzedJoin; class MergeJoinCursor; struct MergeJoinEqualRange; +class Volume; +using VolumePtr = std::shared_ptr; + struct MiniLSM { using SortedFiles = std::vector>; - const String & path; + VolumePtr volume; const Block & sample_block; const SortDescription & sort_description; const size_t rows_in_block; const size_t max_size; std::vector sorted_files; - MiniLSM(const String & path_, const Block & sample_block_, const SortDescription & description, + MiniLSM(VolumePtr volume_, const Block & sample_block_, const SortDescription & description, size_t rows_in_block_, size_t max_size_ = 16) - : path(path_) + : volume(volume_) , sample_block(sample_block_) , sort_description(description) , rows_in_block(rows_in_block_) diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.cpp b/dbms/src/Interpreters/SyntaxAnalyzer.cpp index a485bd7ad73..b929804d0ae 100644 --- 
a/dbms/src/Interpreters/SyntaxAnalyzer.cpp +++ b/dbms/src/Interpreters/SyntaxAnalyzer.cpp @@ -816,7 +816,7 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze( SyntaxAnalyzerResult result; result.storage = storage; result.source_columns = source_columns_; - result.analyzed_join = std::make_shared(settings, context.getTemporaryPath()); /// TODO: move to select_query logic + result.analyzed_join = std::make_shared(settings, context.getTemporaryVolume()); /// TODO: move to select_query logic if (storage) collectSourceColumns(storage->getColumns(), result.source_columns, (select_query != nullptr)); diff --git a/dbms/src/Interpreters/tests/aggregate.cpp b/dbms/src/Interpreters/tests/aggregate.cpp index 4d4d964aa9a..df498d6039d 100644 --- a/dbms/src/Interpreters/tests/aggregate.cpp +++ b/dbms/src/Interpreters/tests/aggregate.cpp @@ -79,7 +79,7 @@ int main(int argc, char ** argv) Aggregator::Params params( stream->getHeader(), {0, 1}, aggregate_descriptions, - false, 0, OverflowMode::THROW, 0, 0, 0, false, "", 1, 0); + false, 0, OverflowMode::THROW, 0, 0, 0, false, nullptr, 1, 0); Aggregator aggregator(params); diff --git a/dbms/src/Processors/Transforms/MergeSortingTransform.cpp b/dbms/src/Processors/Transforms/MergeSortingTransform.cpp index 39da24ba149..060d860b0b5 100644 --- a/dbms/src/Processors/Transforms/MergeSortingTransform.cpp +++ b/dbms/src/Processors/Transforms/MergeSortingTransform.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace ProfileEvents @@ -95,11 +96,11 @@ MergeSortingTransform::MergeSortingTransform( const SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_, size_t max_bytes_before_remerge_, - size_t max_bytes_before_external_sort_, const std::string & tmp_path_, + size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_, size_t min_free_disk_space_) : SortingTransform(header, description_, max_merged_block_size_, limit_) , max_bytes_before_remerge(max_bytes_before_remerge_) - , 
max_bytes_before_external_sort(max_bytes_before_external_sort_), tmp_path(tmp_path_) + , max_bytes_before_external_sort(max_bytes_before_external_sort_), tmp_volume(tmp_volume_) , min_free_disk_space(min_free_disk_space_) {} Processors MergeSortingTransform::expandPipeline() @@ -172,10 +173,14 @@ void MergeSortingTransform::consume(Chunk chunk) */ if (max_bytes_before_external_sort && sum_bytes_in_blocks > max_bytes_before_external_sort) { - if (!enoughSpaceInDirectory(tmp_path, sum_bytes_in_blocks + min_free_disk_space)) - throw Exception("Not enough space for external sort in " + tmp_path, ErrorCodes::NOT_ENOUGH_SPACE); + size_t size = sum_bytes_in_blocks + min_free_disk_space; + auto reservation = tmp_volume->reserve(size); + if (!reservation) + throw Exception("Not enough space for external sort in temporary storage", ErrorCodes::NOT_ENOUGH_SPACE); + const std::string tmp_path(reservation->getDisk()->getPath()); temporary_files.emplace_back(createTemporaryFile(tmp_path)); + const std::string & path = temporary_files.back()->path(); merge_sorter = std::make_unique(std::move(chunks), description, max_merged_block_size, limit); auto current_processor = std::make_shared(header_without_constants, log, path); diff --git a/dbms/src/Processors/Transforms/MergeSortingTransform.h b/dbms/src/Processors/Transforms/MergeSortingTransform.h index ecfaeb4f272..09c2b182fc7 100644 --- a/dbms/src/Processors/Transforms/MergeSortingTransform.h +++ b/dbms/src/Processors/Transforms/MergeSortingTransform.h @@ -9,6 +9,9 @@ namespace DB { +class Volume; +using VolumePtr = std::shared_ptr; + class MergeSortingTransform : public SortingTransform { public: @@ -17,7 +20,7 @@ public: const SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_, size_t max_bytes_before_remerge_, - size_t max_bytes_before_external_sort_, const std::string & tmp_path_, + size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_, size_t min_free_disk_space_); String getName() const 
override { return "MergeSortingTransform"; } @@ -32,7 +35,7 @@ protected: private: size_t max_bytes_before_remerge; size_t max_bytes_before_external_sort; - const std::string tmp_path; + VolumePtr tmp_volume; size_t min_free_disk_space; Logger * log = &Logger::get("MergeSortingTransform"); diff --git a/dbms/src/Processors/tests/processors_test_aggregation.cpp b/dbms/src/Processors/tests/processors_test_aggregation.cpp index ccf31d953ac..903633c18ec 100644 --- a/dbms/src/Processors/tests/processors_test_aggregation.cpp +++ b/dbms/src/Processors/tests/processors_test_aggregation.cpp @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include #include @@ -187,6 +189,8 @@ try auto & factory = AggregateFunctionFactory::instance(); auto cur_path = Poco::Path().absolute().toString(); + auto disk = std::make_shared("tmp", cur_path, 0); + auto tmp_volume = std::make_shared("tmp", std::vector{disk}, 0); auto execute_one_stream = [&](String msg, size_t num_threads, bool two_level, bool external) { @@ -228,7 +232,7 @@ try group_by_two_level_threshold_bytes, max_bytes_before_external_group_by, false, /// empty_result_for_aggregation_by_empty_set - cur_path, /// tmp_path + tmp_volume, 1, /// max_threads 0 ); @@ -301,7 +305,7 @@ try group_by_two_level_threshold_bytes, max_bytes_before_external_group_by, false, /// empty_result_for_aggregation_by_empty_set - cur_path, /// tmp_path + tmp_volume, 1, /// max_threads 0 ); diff --git a/dbms/src/Processors/tests/processors_test_merge_sorting_transform.cpp b/dbms/src/Processors/tests/processors_test_merge_sorting_transform.cpp index 8e6b4655127..f0b20959e87 100644 --- a/dbms/src/Processors/tests/processors_test_merge_sorting_transform.cpp +++ b/dbms/src/Processors/tests/processors_test_merge_sorting_transform.cpp @@ -1,6 +1,8 @@ #include #include +#include +#include #include #include @@ -116,7 +118,10 @@ try Logger::root().setChannel(channel); Logger::root().setLevel("trace"); - auto execute_chain = []( + auto disk 
= std::make_shared("tmp", ".", 0); + auto tmp_volume = std::make_shared("tmp", std::vector{disk}, 0); + + auto execute_chain = [tmp_volume]( String msg, UInt64 source_block_size, UInt64 blocks_count, @@ -133,7 +138,9 @@ try SortDescription description = {{0, 1, 1}}; auto transform = std::make_shared( source->getPort().getHeader(), description, - max_merged_block_size, limit, max_bytes_before_remerge, max_bytes_before_external_sort, ".", 0); + max_merged_block_size, limit, + max_bytes_before_remerge, max_bytes_before_external_sort, + tmp_volume, 0); auto sink = std::make_shared(); connect(source->getPort(), transform->getInputs().front()); From c9cc1ef51691a66a467c993bd8bc2d770283eade Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 19 Jan 2020 22:16:33 +0300 Subject: [PATCH 76/89] Cover tmp_policy --- .../integration/test_tmp_policy/__init__.py | 0 .../config.d/storage_configuration.xml | 25 ++++++++++++++ .../tests/integration/test_tmp_policy/test.py | 34 +++++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100644 dbms/tests/integration/test_tmp_policy/__init__.py create mode 100644 dbms/tests/integration/test_tmp_policy/configs/config.d/storage_configuration.xml create mode 100644 dbms/tests/integration/test_tmp_policy/test.py diff --git a/dbms/tests/integration/test_tmp_policy/__init__.py b/dbms/tests/integration/test_tmp_policy/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/integration/test_tmp_policy/configs/config.d/storage_configuration.xml b/dbms/tests/integration/test_tmp_policy/configs/config.d/storage_configuration.xml new file mode 100644 index 00000000000..f8574a38208 --- /dev/null +++ b/dbms/tests/integration/test_tmp_policy/configs/config.d/storage_configuration.xml @@ -0,0 +1,25 @@ + + + + + /disk1/ + + + /disk2/ + + + + + + +
+ disk1 + disk2 +
+
+
+
+
+ + tmp +
diff --git a/dbms/tests/integration/test_tmp_policy/test.py b/dbms/tests/integration/test_tmp_policy/test.py new file mode 100644 index 00000000000..5c5900cc9dc --- /dev/null +++ b/dbms/tests/integration/test_tmp_policy/test.py @@ -0,0 +1,34 @@ +# pylint: disable=unused-argument +# pylint: disable=redefined-outer-name + +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance('node', + config_dir='configs', + tmpfs=['/disk1:size=100M', '/disk2:size=100M']) + +@pytest.fixture(scope='module') +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + +def test_different_versions(start_cluster): + query = 'SELECT count(ignore(*)) FROM (SELECT * FROM system.numbers LIMIT 1e7) GROUP BY number' + settings = { + 'max_bytes_before_external_group_by': 1<<20, + 'max_bytes_before_external_sort': 1<<20, + } + + assert node.contains_in_log('Setting up /disk1/ to store temporary data in it') + assert node.contains_in_log('Setting up /disk2/ to store temporary data in it') + + node.query(query, settings=settings) + assert node.contains_in_log('Writing part of aggregation data into temporary file /disk1/') + assert node.contains_in_log('Writing part of aggregation data into temporary file /disk2/') From c7616ff11aed9701622802636a49c96669fb61db Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 23 Jan 2020 20:48:26 +0300 Subject: [PATCH 77/89] [wip] performance comparison --- docker/test/performance-comparison/compare.sh | 8 +- .../test/performance-comparison/entrypoint.sh | 2 +- docker/test/performance-comparison/eqmed.sql | 8 +- docker/test/performance-comparison/report.py | 105 ++++++++++++++++++ 4 files changed, 115 insertions(+), 8 deletions(-) create mode 100755 docker/test/performance-comparison/report.py diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index 098800d5573..4b4501892e7 100755 
--- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -146,7 +146,9 @@ run_tests # Analyze results result_structure="left float, right float, diff float, rd Array(float), query text" -right/clickhouse local --file '*-report.tsv' -S "$result_structure" --query "select * from table where diff < 0.05 and rd[3] > 0.05 order by rd[3] desc" > flap-prone.tsv -right/clickhouse local --file '*-report.tsv' -S "$result_structure" --query "select * from table where diff > 0.05 and diff > rd[3] order by diff desc" > bad-perf.tsv -right/clickhouse local --file '*-client-time.tsv' -S "query text, client float, server float" -q "select *, floor(client/server, 3) p from table order by p desc" > client-time.tsv +right/clickhouse local --file '*-report.tsv' -S "$result_structure" --query "select * from table where abs(diff) < 0.05 and rd[3] > 0.05 order by rd[3] desc" > unstable.tsv +right/clickhouse local --file '*-report.tsv' -S "$result_structure" --query "select * from table where abs(diff) > 0.05 and abs(diff) > rd[3] order by diff desc" > changed-perf.tsv +right/clickhouse local --file '*-client-time.tsv' -S "query text, client float, server float" -q "select client, server, floor(client/server, 3) p, query from table where p > 1.01 order by p desc" > slow-on-client.tsv grep Exception:[^:] *-err.log > run-errors.log + +./report.py > report.html diff --git a/docker/test/performance-comparison/entrypoint.sh b/docker/test/performance-comparison/entrypoint.sh index 589bb58fe8b..3a4d33326af 100755 --- a/docker/test/performance-comparison/entrypoint.sh +++ b/docker/test/performance-comparison/entrypoint.sh @@ -29,5 +29,5 @@ set -m time ../compare.sh 0 $ref_sha $PR_TO_TEST $SHA_TO_TEST 2>&1 | ts | tee compare.log set +m -7z a /output/output.7z *.log *.tsv +7z a /output/output.7z *.log *.tsv *.html cp compare.log /output diff --git a/docker/test/performance-comparison/eqmed.sql b/docker/test/performance-comparison/eqmed.sql index 
5e8d842b7df..cdc7cbec85f 100644 --- a/docker/test/performance-comparison/eqmed.sql +++ b/docker/test/performance-comparison/eqmed.sql @@ -1,10 +1,10 @@ -- input is table(query text, run UInt32, version int, time float) select -- abs(diff_percent) > rd_quantiles_percent[3] fail, - floor(original_medians_array.time_by_version[1], 4) m1, - floor(original_medians_array.time_by_version[2], 4) m2, - floor((m1 - m2) / m1, 3) diff_percent, - arrayMap(x -> floor(x / m1, 3), rd.rd_quantiles) rd_quantiles_percent, + floor(original_medians_array.time_by_version[1], 4) left, + floor(original_medians_array.time_by_version[2], 4) right, + floor((right - left) / left, 3) diff_percent, + arrayMap(x -> floor(x / left, 3), rd.rd_quantiles) rd_quantiles_percent, query from ( diff --git a/docker/test/performance-comparison/report.py b/docker/test/performance-comparison/report.py new file mode 100755 index 00000000000..64461ba0587 --- /dev/null +++ b/docker/test/performance-comparison/report.py @@ -0,0 +1,105 @@ +#!/usr/bin/python3 + +import collections +import csv +import os +import sys + +doc_template = """ + + + + {header} + + +
+ +

{header}

+{test_part} + + + +""" + +table_template = """ +

{caption}

+ +{header} +{rows} +
+""" + +def tr(x): + return '' + str(x) + '' + +def td(x): + return '' + str(x) + '' + +def th(x): + return '' + str(x) + '' + +def table_row(r): + return tr(''.join([td(f) for f in r])) + +def table_header(r): + return tr(''.join([th(f) for f in r])) + +def tsv_rows(n): + result = '' + with open(n) as fd: + for row in csv.reader(fd, delimiter="\t", quotechar='"'): + result += table_row(row) + return result + +params = collections.defaultdict(str) +params['header'] = "ClickHouse Performance Comparison" +params['test_part'] = (table_template.format_map( + collections.defaultdict(str, + caption = 'Changes in performance', + header = table_header(['Left', 'Right', 'Diff', 'RD', 'Query']), + rows = tsv_rows('changed-perf.tsv'))) + + table_template.format( + caption = 'Slow on client', + header = table_header(['Client', 'Server', 'Ratio', 'Query']), + rows = tsv_rows('slow-on-client.tsv')) + + table_template.format( + caption = 'Unstable', + header = table_header(['Left', 'Right', 'Diff', 'RD', 'Query']), + rows = tsv_rows('unstable.tsv')) + + table_template.format( + caption = 'Run errors', + header = table_header(['A', 'B']), + rows = tsv_rows('run-errors.log')) +) +print(doc_template.format_map(params)) From 4f868614d518c6d50ff28e0bf3934507fb6c23ef Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 23 Jan 2020 21:08:27 +0300 Subject: [PATCH 78/89] Document tmp_policy --- docs/en/operations/server_settings/settings.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/en/operations/server_settings/settings.md b/docs/en/operations/server_settings/settings.md index c13d53eabc7..550a84350d9 100644 --- a/docs/en/operations/server_settings/settings.md +++ b/docs/en/operations/server_settings/settings.md @@ -723,7 +723,7 @@ Example 9004 ``` -## tmp_path +## tmp_path {#server-settings-tmp_path} Path to temporary data for processing large queries. @@ -737,6 +737,17 @@ Path to temporary data for processing large queries. 
``` +## tmp_policy {#server-settings-tmp_policy} + +Policy from [`storage_configuration`](mergetree.md#table_engine-mergetree-multiple-volumes) to store temporary files. +If not set [`tmp_path`](#server-settings-tmp_path) is used, otherwise it is ignored. + +!!! note + - `move_factor` is ignored + - `keep_free_space_bytes` is ignored + - `max_data_part_size_bytes` is ignored + - you must have exactly one volume in that policy + ## uncompressed_cache_size {#server-settings-uncompressed_cache_size} Cache size (in bytes) for uncompressed data used by table engines from the [MergeTree](../../operations/table_engines/mergetree.md). From 59d9bfa71edd6f0b56fa83a9cdf65dd8973a74a6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 21:47:17 +0300 Subject: [PATCH 79/89] Enable Avro with Memory Sanitizer --- cmake/sanitize.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 7f23524bdee..13947425f7b 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -50,7 +50,6 @@ if (SANITIZE) set (USE_SIMDJSON 0 CACHE BOOL "") set (ENABLE_ORC 0 CACHE BOOL "") set (ENABLE_PARQUET 0 CACHE BOOL "") - set (ENABLE_AVRO 0 CACHE BOOL "") set (USE_CAPNP 0 CACHE BOOL "") set (USE_INTERNAL_ORC_LIBRARY 0 CACHE BOOL "") set (USE_ORC 0 CACHE BOOL "") From 5ab0e23a0ce2fa8596c03412ad0b736f867644d6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 21:52:23 +0300 Subject: [PATCH 80/89] Forked Avro to make changes for UBSan --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index ea4fd74dd8c..dab5b985760 100644 --- a/.gitmodules +++ b/.gitmodules @@ -142,5 +142,5 @@ url = https://github.com/ClickHouse-Extras/ryu.git [submodule "contrib/avro"] path = contrib/avro - url = https://github.com/apache/avro.git + url = https://github.com/ClickHouse-Extras/avro.git ignore = untracked From 64984cf06126026b6482da58f4046ca0b560ebc7 Mon Sep 17 00:00:00 2001 
From: Alexey Milovidov Date: Thu, 23 Jan 2020 22:06:53 +0300 Subject: [PATCH 81/89] Update Avro submodule to prevent UBSan failure --- contrib/avro | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/avro b/contrib/avro index 89218262cde..e852a8c15ed 160000 --- a/contrib/avro +++ b/contrib/avro @@ -1 +1 @@ -Subproject commit 89218262cde62e98fcb3778b86cd3f03056c54f3 +Subproject commit e852a8c15ed0ff704a76b0c74a0fcf675ab270c0 From 205f798491c4427eae9a518c08574fdd2d4ac970 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 23 Jan 2020 23:04:53 +0300 Subject: [PATCH 82/89] Update Avro submodule to prevent UBSan failure --- contrib/avro | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/avro b/contrib/avro index e852a8c15ed..5b2752041c8 160000 --- a/contrib/avro +++ b/contrib/avro @@ -1 +1 @@ -Subproject commit e852a8c15ed0ff704a76b0c74a0fcf675ab270c0 +Subproject commit 5b2752041c8d2f75eb5c1dbec8b4c25fc0e24d12 From 12d980259efbf9ce593e02e58c34b574f674f913 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 24 Jan 2020 05:38:03 +0300 Subject: [PATCH 83/89] Fixed bug with parentheses --- dbms/src/Common/Exception.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Common/Exception.cpp b/dbms/src/Common/Exception.cpp index 25da9674e4d..b536898c725 100644 --- a/dbms/src/Common/Exception.cpp +++ b/dbms/src/Common/Exception.cpp @@ -195,7 +195,7 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded << ", e.displayText() = " << e.displayText() << (with_stacktrace ? getExceptionStackTraceString(e) : "") << (with_extra_info ? getExtraExceptionInfo(e) : "") - << " (version " << VERSION_STRING << VERSION_OFFICIAL; + << " (version " << VERSION_STRING << VERSION_OFFICIAL << ")"; } catch (...) {} } @@ -212,7 +212,7 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded stream << "std::exception. 
Code: " << ErrorCodes::STD_EXCEPTION << ", type: " << name << ", e.what() = " << e.what() << (with_stacktrace ? getExceptionStackTraceString(e) : "") << (with_extra_info ? getExtraExceptionInfo(e) : "") - << ", version = " << VERSION_STRING << VERSION_OFFICIAL; + << ", version = " << VERSION_STRING << VERSION_OFFICIAL << ")"; } catch (...) {} } From 3a5c7370b21a214e4aa63f12cd87ea597dcca1f3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 24 Jan 2020 07:05:15 +0300 Subject: [PATCH 84/89] Addition to prev. revision #8811 --- dbms/src/Common/Exception.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/Exception.cpp b/dbms/src/Common/Exception.cpp index b536898c725..318da1a27f2 100644 --- a/dbms/src/Common/Exception.cpp +++ b/dbms/src/Common/Exception.cpp @@ -212,7 +212,7 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded stream << "std::exception. Code: " << ErrorCodes::STD_EXCEPTION << ", type: " << name << ", e.what() = " << e.what() << (with_stacktrace ? getExceptionStackTraceString(e) : "") << (with_extra_info ? getExtraExceptionInfo(e) : "") - << ", version = " << VERSION_STRING << VERSION_OFFICIAL << ")"; + << ", version = " << VERSION_STRING << VERSION_OFFICIAL; } catch (...) 
{} } From 7af36b9b747526bc77a998813069b966a0b592e9 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 24 Jan 2020 11:07:20 +0300 Subject: [PATCH 85/89] Better image for woboq browser --- docker/test/codebrowser/Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/test/codebrowser/Dockerfile b/docker/test/codebrowser/Dockerfile index 0c612dff0bd..3ed3b250f0c 100644 --- a/docker/test/codebrowser/Dockerfile +++ b/docker/test/codebrowser/Dockerfile @@ -24,7 +24,9 @@ RUN sudo apt-get --yes --allow-unauthenticated update RUN sudo apt-get --yes --allow-unauthenticated install cmake clang-8 libllvm8 libclang-8-dev # repo versions doesn't work correctly with C++17 -RUN git clone https://github.com/woboq/woboq_codebrowser.git +# also we push reports to s3, so we add index.html to subfolder urls +# https://github.com/ClickHouse-Extras/woboq_codebrowser/commit/37e15eaf377b920acb0b48dbe82471be9203f76b +RUN git clone https://github.com/ClickHouse-Extras/woboq_codebrowser RUN cd woboq_codebrowser && cmake . 
-DCMAKE_BUILD_TYPE=Release && make -j ENV CODEGEN=/woboq_codebrowser/generator/codebrowser_generator @@ -40,6 +42,6 @@ CMD mkdir -p $BUILD_DIRECTORY && cd $BUILD_DIRECTORY && \ cmake $SOURCE_DIRECTORY -DCMAKE_CXX_COMPILER=/usr/bin/clang\+\+-8 -DCMAKE_C_COMPILER=/usr/bin/clang-8 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON && \ mkdir -p $HTML_RESULT_DIRECTORY && \ $CODEGEN -b $BUILD_DIRECTORY -a -o $HTML_RESULT_DIRECTORY -p ClickHouse:$SOURCE_DIRECTORY:$SHA && \ - $CODEINDEX $HTML_RESULT_DIRECTORY && \ cp -r $STATIC_DATA $HTML_RESULT_DIRECTORY/ &&\ + $CODEINDEX $HTML_RESULT_DIRECTORY -d "data" && \ mv $HTML_RESULT_DIRECTORY /test_output From 32783b3ceacfc67102a525ee6ea5bec959ef901f Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 24 Jan 2020 11:55:36 +0300 Subject: [PATCH 86/89] Fix integration test --- .../integration/test_parts_delete_zookeeper/test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dbms/tests/integration/test_parts_delete_zookeeper/test.py b/dbms/tests/integration/test_parts_delete_zookeeper/test.py index 1c23a4a658a..7e4a8d36741 100644 --- a/dbms/tests/integration/test_parts_delete_zookeeper/test.py +++ b/dbms/tests/integration/test_parts_delete_zookeeper/test.py @@ -40,13 +40,13 @@ def test_merge_doesnt_work_without_zookeeper(start_cluster): assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "2\n" node1.query("OPTIMIZE TABLE test_table FINAL") - assert node1.query("SELECT count(*) from system.parts") == "3\n" + assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "3\n" - assert_eq_with_retry(node1, "SELECT count(*) from system.parts", "1") + assert_eq_with_retry(node1, "SELECT count(*) from system.parts where table = 'test_table' and active = 1", "1") node1.query("TRUNCATE TABLE test_table") - assert node1.query("SELECT count(*) from system.parts") == "0\n" + assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "0\n" 
node1.query("INSERT INTO test_table VALUES ('2018-10-01', 1), ('2018-10-02', 2), ('2018-10-03', 3)") node1.query("INSERT INTO test_table VALUES ('2018-10-01', 4), ('2018-10-02', 5), ('2018-10-03', 6)") @@ -56,6 +56,6 @@ def test_merge_doesnt_work_without_zookeeper(start_cluster): node1.query("OPTIMIZE TABLE test_table FINAL") pm.drop_instance_zk_connections(node1) time.sleep(10) # > old_parts_lifetime - assert node1.query("SELECT count(*) from system.parts") == "3\n" + assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "3\n" - assert_eq_with_retry(node1, "SELECT count(*) from system.parts", "1") + assert_eq_with_retry(node1, "SELECT count(*) from system.parts where table = 'test_table' and active = 1", "1") From 95fbef59c9b2fd03688ebb1a3464ee7e23342623 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 24 Jan 2020 12:16:34 +0300 Subject: [PATCH 87/89] Fix prometheus test --- .../test_prometheus_endpoint/test.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/dbms/tests/integration/test_prometheus_endpoint/test.py b/dbms/tests/integration/test_prometheus_endpoint/test.py index dcd31621cb5..25d83cfb47c 100644 --- a/dbms/tests/integration/test_prometheus_endpoint/test.py +++ b/dbms/tests/integration/test_prometheus_endpoint/test.py @@ -3,6 +3,7 @@ import pytest import re import requests +import time from helpers.cluster import ClickHouseCluster @@ -24,7 +25,7 @@ def parse_response_line(line): "# HELP", "# TYPE", ] - assert any(line.startswith(prefix) for prefix in allowed_prefixes), msg + assert any(line.startswith(prefix) for prefix in allowed_prefixes) if line.startswith("#"): return {} @@ -34,12 +35,23 @@ def parse_response_line(line): return {name: int(val)} -def get_and_check_metrics(): - response = requests.get("http://{host}:{port}/metrics".format( - host=node.ip_address, port=8001), allow_redirects=False) +def get_and_check_metrics(retries): + while True: + try: + response = 
requests.get("http://{host}:{port}/metrics".format( + host=node.ip_address, port=8001), allow_redirects=False) - if response.status_code != 200: - response.raise_for_status() + if response.status_code != 200: + response.raise_for_status() + + break + except: + if retries >= 0: + retries -= 1 + time.sleep(0.5) + continue + else: + raise assert response.headers['content-type'].startswith('text/plain') @@ -55,13 +67,13 @@ def get_and_check_metrics(): def test_prometheus_endpoint(start_cluster): - metrics_dict = get_and_check_metrics() + metrics_dict = get_and_check_metrics(10) assert metrics_dict['ClickHouseProfileEvents_Query'] >= 0 prev_query_count = metrics_dict['ClickHouseProfileEvents_Query'] - resp = node.query("SELECT 1") - resp = node.query("SELECT 2") - resp = node.query("SELECT 3") + node.query("SELECT 1") + node.query("SELECT 2") + node.query("SELECT 3") - metrics_dict = get_and_check_metrics() + metrics_dict = get_and_check_metrics(10) assert metrics_dict['ClickHouseProfileEvents_Query'] >= prev_query_count + 3 From 9fe94ca5f75b108edfaa93c0fa0df73c9c7fe674 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 24 Jan 2020 13:47:19 +0300 Subject: [PATCH 88/89] fix confusing logic (addition to #8800) --- dbms/src/Storages/MergeTree/MergeTreeData.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.h b/dbms/src/Storages/MergeTree/MergeTreeData.h index ba2af73a421..ab5644749ee 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.h +++ b/dbms/src/Storages/MergeTree/MergeTreeData.h @@ -579,7 +579,7 @@ public: bool hasAnyColumnTTL() const { return !column_ttl_entries_by_name.empty(); } bool hasAnyMoveTTL() const { return !move_ttl_entries.empty(); } - bool hasRowsTTL() const { return rows_ttl_entry.isEmpty(); } + bool hasRowsTTL() const { return !rows_ttl_entry.isEmpty(); } /// Check that the part is not broken and calculate the checksums for it if they are not present. 
MutableDataPartPtr loadPartAndFixMetadata(const DiskPtr & disk, const String & relative_path); @@ -738,7 +738,7 @@ public: /// Checks if given part already belongs destination disk or volume for this rule. bool isPartInDestination(const StoragePolicyPtr & policy, const MergeTreeDataPart & part) const; - bool isEmpty() const { return expression != nullptr; } + bool isEmpty() const { return expression == nullptr; } }; std::optional selectTTLEntryForTTLInfos(const MergeTreeDataPart::TTLInfos & ttl_infos, time_t time_of_move) const; From aeb9557edfcc4e394afb8416988e31c6c7812df9 Mon Sep 17 00:00:00 2001 From: Mikhail Korotov Date: Fri, 24 Jan 2020 20:04:42 +0300 Subject: [PATCH 89/89] tests added --- ..._secondary_index_with_old_format_merge_tree.reference | 0 ...bition_secondary_index_with_old_format_merge_tree.sql | 9 +++++++++ 2 files changed, 9 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.reference create mode 100644 dbms/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.sql diff --git a/dbms/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.reference b/dbms/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.sql b/dbms/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.sql new file mode 100644 index 00000000000..50259c28420 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.sql @@ -0,0 +1,9 @@ +CREATE TABLE old_syntax_01071_test (date Date, id UInt8) ENGINE = MergeTree(date, id, 8192); +SET allow_experimental_data_skipping_indices=1; +ALTER TABLE old_syntax_01071_test ADD INDEX id_minmax id TYPE minmax GRANULARITY 1; -- 
{ serverError 36 } +CREATE TABLE new_syntax_01071_test (date Date, id UInt8) ENGINE = MergeTree() ORDER BY id; +ALTER TABLE new_syntax_01071_test ADD INDEX id_minmax id TYPE minmax GRANULARITY 1; +DETACH TABLE new_syntax_01071_test; +ATTACH TABLE new_syntax_01071_test; +DROP TABLE IF EXISTS old_syntax_01071_test; +DROP TABLE IF EXISTS new_syntax_01071_test;