Merge branch 'master' into simplify-clickhouse-test

Alexey Milovidov 2019-04-24 19:07:17 +03:00
commit cb763f03e9
74 changed files with 1096 additions and 529 deletions

View File

@ -59,7 +59,7 @@ if (NOT MAKE_STATIC_LIBRARIES)
endif ()
if (SPLIT_SHARED_LIBRARIES)
set (LINK_MODE SHARED)
set(BUILD_SHARED_LIBS 1 CACHE INTERNAL "")
endif ()
if (USE_STATIC_LIBRARIES)

View File

@ -1,9 +1,8 @@
if (OS_FREEBSD)
find_library (EXECINFO_LIBRARY execinfo)
find_library (ELF_LIBRARY elf)
message (STATUS "Using execinfo: ${EXECINFO_LIBRARY}")
message (STATUS "Using elf: ${ELF_LIBRARY}")
set (EXECINFO_LIBRARIES ${EXECINFO_LIBRARY} ${ELF_LIBRARY})
message (STATUS "Using execinfo: ${EXECINFO_LIBRARIES}")
else ()
set (EXECINFO_LIBRARY "")
set (ELF_LIBRARY "")
set (EXECINFO_LIBRARIES "")
endif ()

View File

@ -41,7 +41,7 @@ set( thriftcpp_threads_SOURCES
${LIBRARY_DIR}/src/thrift/concurrency/Monitor.cpp
${LIBRARY_DIR}/src/thrift/concurrency/Mutex.cpp
)
add_library(${THRIFT_LIBRARY} ${LINK_MODE} ${thriftcpp_SOURCES} ${thriftcpp_threads_SOURCES})
add_library(${THRIFT_LIBRARY} ${thriftcpp_SOURCES} ${thriftcpp_threads_SOURCES})
set_target_properties(${THRIFT_LIBRARY} PROPERTIES CXX_STANDARD 14) # REMOVE after https://github.com/apache/thrift/pull/1641
target_include_directories(${THRIFT_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/thrift/lib/cpp/src PRIVATE ${Boost_INCLUDE_DIRS})
@ -149,7 +149,7 @@ if (ARROW_WITH_ZSTD)
endif()
add_library(${ARROW_LIBRARY} ${LINK_MODE} ${ARROW_SRCS})
add_library(${ARROW_LIBRARY} ${ARROW_SRCS})
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src ${Boost_INCLUDE_DIRS})
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} Threads::Threads)
if (ARROW_WITH_LZ4)
@ -195,7 +195,7 @@ list(APPEND PARQUET_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/parquet/parquet_constants.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/src/parquet/parquet_types.cpp
)
add_library(${PARQUET_LIBRARY} ${LINK_MODE} ${PARQUET_SRCS})
add_library(${PARQUET_LIBRARY} ${PARQUET_SRCS})
target_include_directories(${PARQUET_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src)
include(${ClickHouse_SOURCE_DIR}/contrib/thrift/build/cmake/ConfigureChecks.cmake) # makes config.h
target_link_libraries(${PARQUET_LIBRARY} PUBLIC ${ARROW_LIBRARY} PRIVATE ${THRIFT_LIBRARY} ${Boost_REGEX_LIBRARY})

View File

@ -24,7 +24,7 @@ endif ()
configure_file(config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h)
add_library(base64 ${LINK_MODE}
add_library(base64
${LIBRARY_DIR}/lib/lib.c
${LIBRARY_DIR}/lib/codec_choose.c
${LIBRARY_DIR}/lib/arch/avx/codec.c

View File

@ -20,7 +20,7 @@ endif()
macro(add_boost_lib lib_name)
add_headers_and_sources(boost_${lib_name} ${LIBRARY_DIR}/libs/${lib_name}/src)
add_library(boost_${lib_name}_internal ${LINK_MODE} ${boost_${lib_name}_sources})
add_library(boost_${lib_name}_internal ${boost_${lib_name}_sources})
target_include_directories(boost_${lib_name}_internal SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS})
target_compile_definitions(boost_${lib_name}_internal PUBLIC BOOST_SYSTEM_NO_DEPRECATED)
endmacro()

View File

@ -28,6 +28,6 @@ set(SRCS
${BROTLI_SOURCE_DIR}/common/transform.c
)
add_library(brotli ${LINK_MODE} ${SRCS})
add_library(brotli ${SRCS})
target_include_directories(brotli PUBLIC ${BROTLI_SOURCE_DIR}/include)

View File

@ -1,6 +1,6 @@
SET(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/cctz)
add_library(cctz ${LINK_MODE}
add_library(cctz
${LIBRARY_DIR}/src/civil_time_detail.cc
${LIBRARY_DIR}/src/time_zone_fixed.cc
${LIBRARY_DIR}/src/time_zone_format.cc

View File

@ -23,7 +23,7 @@ set(SRCS
${CPPKAFKA_DIR}/src/consumer.cpp
)
add_library(cppkafka ${LINK_MODE} ${SRCS})
add_library(cppkafka ${SRCS})
target_link_libraries(cppkafka PRIVATE ${RDKAFKA_LIBRARY})
target_include_directories(cppkafka PRIVATE ${CPPKAFKA_DIR}/include/cppkafka)

View File

@ -1,4 +1,4 @@
add_library (btrie
add_library(btrie
src/btrie.c
include/btrie.h
)

View File

@ -183,7 +183,7 @@ set(SRCS
)
# target
add_library(hdfs3 STATIC ${SRCS} ${PROTO_SOURCES} ${PROTO_HEADERS})
add_library(hdfs3 ${SRCS} ${PROTO_SOURCES} ${PROTO_HEADERS})
if (USE_INTERNAL_PROTOBUF_LIBRARY)
add_dependencies(hdfs3 protoc)

View File

@ -54,7 +54,7 @@ set(SRCS
${RDKAFKA_SOURCE_DIR}/rdgz.c
)
add_library(rdkafka ${LINK_MODE} ${SRCS})
add_library(rdkafka ${SRCS})
target_include_directories(rdkafka SYSTEM PUBLIC include)
target_include_directories(rdkafka SYSTEM PUBLIC ${RDKAFKA_SOURCE_DIR}) # Because weird logic with "include_next" is used.
target_include_directories(rdkafka SYSTEM PRIVATE ${ZSTD_INCLUDE_DIR}/common) # Because wrong path to "zstd_errors.h" is used.

View File

@ -16,7 +16,7 @@ set(SRCS
${LIBXML2_SOURCE_DIR}/debugXML.c
${LIBXML2_SOURCE_DIR}/xpointer.c
${LIBXML2_SOURCE_DIR}/DOCBparser.c
${LIBXML2_SOURCE_DIR}/xmlcatalog.c
#${LIBXML2_SOURCE_DIR}/xmlcatalog.c
${LIBXML2_SOURCE_DIR}/c14n.c
${LIBXML2_SOURCE_DIR}/xmlreader.c
${LIBXML2_SOURCE_DIR}/xmlstring.c
@ -26,7 +26,7 @@ set(SRCS
${LIBXML2_SOURCE_DIR}/trionan.c
${LIBXML2_SOURCE_DIR}/pattern.c
${LIBXML2_SOURCE_DIR}/globals.c
${LIBXML2_SOURCE_DIR}/xmllint.c
#${LIBXML2_SOURCE_DIR}/xmllint.c
${LIBXML2_SOURCE_DIR}/chvalid.c
${LIBXML2_SOURCE_DIR}/relaxng.c
${LIBXML2_SOURCE_DIR}/list.c
@ -40,7 +40,7 @@ set(SRCS
${LIBXML2_SOURCE_DIR}/xmlschemas.c
${LIBXML2_SOURCE_DIR}/SAX2.c
${LIBXML2_SOURCE_DIR}/threads.c
${LIBXML2_SOURCE_DIR}/runsuite.c
#${LIBXML2_SOURCE_DIR}/runsuite.c
${LIBXML2_SOURCE_DIR}/catalog.c
${LIBXML2_SOURCE_DIR}/uri.c
${LIBXML2_SOURCE_DIR}/xmlmodule.c
@ -48,7 +48,7 @@ set(SRCS
${LIBXML2_SOURCE_DIR}/parserInternals.c
${LIBXML2_SOURCE_DIR}/xmlwriter.c
${LIBXML2_SOURCE_DIR}/xmlunicode.c
${LIBXML2_SOURCE_DIR}/runxmlconf.c
#${LIBXML2_SOURCE_DIR}/runxmlconf.c
${LIBXML2_SOURCE_DIR}/xmlmemory.c
${LIBXML2_SOURCE_DIR}/nanoftp.c
${LIBXML2_SOURCE_DIR}/xmlschemastypes.c
@ -56,7 +56,7 @@ set(SRCS
${LIBXML2_SOURCE_DIR}/nanohttp.c
${LIBXML2_SOURCE_DIR}/schematron.c
)
add_library(libxml2 STATIC ${SRCS})
add_library(libxml2 ${SRCS})
target_link_libraries(libxml2 ${ZLIB_LIBRARIES})

View File

@ -2,7 +2,7 @@ set(MARIADB_CLIENT_SOURCE_DIR ${CMAKE_SOURCE_DIR}/contrib/mariadb-connector-c)
set(MARIADB_CLIENT_BINARY_DIR ${CMAKE_BINARY_DIR}/contrib/mariadb-connector-c)
set(SRCS
${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/bmove_upp.c
#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/bmove_upp.c
${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/get_password.c
${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_alloc.c
${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_array.c
@ -58,10 +58,10 @@ if(OPENSSL_LIBRARIES)
list(APPEND SRCS ${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/openssl.c)
endif()
add_library(mysqlclient STATIC ${SRCS})
add_library(mysqlclient ${SRCS})
if(OPENSSL_LIBRARIES)
target_link_libraries(mysqlclient ${OPENSSL_LIBRARIES})
target_link_libraries(mysqlclient PRIVATE ${OPENSSL_LIBRARIES})
target_compile_definitions(mysqlclient PRIVATE -D HAVE_OPENSSL -D HAVE_TLS)
endif()

View File

@ -10,7 +10,7 @@ foreach (src ${RE2_SOURCES_})
list(APPEND RE2_ST_SOURCES ${RE2_SOURCE_DIR}/${src})
endforeach ()
add_library (re2_st ${RE2_ST_SOURCES})
add_library(re2_st ${RE2_ST_SOURCES})
target_compile_definitions (re2_st PRIVATE NDEBUG NO_THREADS re2=re2_st)
target_include_directories (re2_st PRIVATE .)
target_include_directories (re2_st SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${RE2_SOURCE_DIR})

View File

@ -23,7 +23,7 @@ ${ODBC_SOURCE_DIR}/libltdl/loaders/preopen.c
${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/libltdl/libltdlcS.c
)
add_library(ltdl ${LINK_MODE} ${SRCS})
add_library(ltdl ${SRCS})
target_include_directories(ltdl PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/libltdl)
target_include_directories(ltdl PUBLIC ${ODBC_SOURCE_DIR}/libltdl)
@ -273,7 +273,7 @@ ${ODBC_SOURCE_DIR}/lst/lstSetFreeFunc.c
${ODBC_SOURCE_DIR}/lst/_lstVisible.c
)
add_library(unixodbc ${LINK_MODE} ${SRCS})
add_library(unixodbc ${SRCS})
target_link_libraries(unixodbc ltdl)

View File

@ -125,6 +125,6 @@ IF (ZSTD_LEGACY_SUPPORT)
${LIBRARY_LEGACY_DIR}/zstd_v07.h)
ENDIF (ZSTD_LEGACY_SUPPORT)
ADD_LIBRARY(zstd ${LINK_MODE} ${Sources} ${Headers})
ADD_LIBRARY(zstd ${Sources} ${Headers})
target_include_directories (zstd PUBLIC ${LIBRARY_DIR})

View File

@ -134,7 +134,7 @@ list (APPEND dbms_headers src/TableFunctions/ITableFunction.h src/TableFunctio
list (APPEND dbms_sources src/Dictionaries/DictionaryFactory.cpp src/Dictionaries/DictionarySourceFactory.cpp src/Dictionaries/DictionaryStructure.cpp)
list (APPEND dbms_headers src/Dictionaries/DictionaryFactory.h src/Dictionaries/DictionarySourceFactory.h src/Dictionaries/DictionaryStructure.h)
add_library(clickhouse_common_io ${LINK_MODE} ${clickhouse_common_io_headers} ${clickhouse_common_io_sources})
add_library(clickhouse_common_io ${clickhouse_common_io_headers} ${clickhouse_common_io_sources})
if (OS_FREEBSD)
target_compile_definitions (clickhouse_common_io PUBLIC CLOCK_MONOTONIC_COARSE=CLOCK_MONOTONIC_FAST)
@ -194,8 +194,7 @@ target_link_libraries (clickhouse_common_io
${CITYHASH_LIBRARIES}
PRIVATE
${ZLIB_LIBRARIES}
${EXECINFO_LIBRARY}
${ELF_LIBRARY}
${EXECINFO_LIBRARIES}
PUBLIC
${Boost_SYSTEM_LIBRARY}
PRIVATE

View File

@ -44,7 +44,7 @@ macro(clickhouse_program_add_library name)
set(CLICKHOUSE_${name_uc}_INCLUDE ${CLICKHOUSE_${name_uc}_INCLUDE} PARENT_SCOPE)
if(NOT CLICKHOUSE_ONE_SHARED)
add_library(clickhouse-${name}-lib ${LINK_MODE} ${CLICKHOUSE_${name_uc}_SOURCES})
add_library(clickhouse-${name}-lib ${CLICKHOUSE_${name_uc}_SOURCES})
set(_link ${CLICKHOUSE_${name_uc}_LINK}) # can't use ${} in if()
if(_link)

View File

@ -2,7 +2,7 @@ add_definitions(-Wno-error -Wno-unused-parameter -Wno-non-virtual-dtor -U_LIBCPP
link_directories(${LLVM_LIBRARY_DIRS})
add_library(clickhouse-compiler-lib ${LINK_MODE}
add_library(clickhouse-compiler-lib
driver.cpp
cc1_main.cpp
cc1as_main.cpp
@ -46,7 +46,7 @@ LLVMSupport
#PollyISL
#PollyPPCG
PUBLIC ${ZLIB_LIBRARIES} ${EXECINFO_LIBRARY} Threads::Threads
PUBLIC ${ZLIB_LIBRARIES} ${EXECINFO_LIBRARIES} Threads::Threads
${MALLOC_LIBRARIES}
${GLIBC_COMPATIBILITY_LIBRARIES}
${MEMCPY_LIBRARIES}

View File

@ -1,8 +1,9 @@
add_definitions(-Wno-error -Wno-unused-parameter -Wno-non-virtual-dtor -U_LIBCPP_DEBUG)
link_directories(${LLVM_LIBRARY_DIRS})
add_library(clickhouse-compiler-lib ${LINK_MODE}
add_library(clickhouse-compiler-lib
driver.cpp
cc1_main.cpp
cc1as_main.cpp
@ -46,7 +47,7 @@ ${REQUIRED_LLVM_LIBRARIES}
#PollyISL
#PollyPPCG
PUBLIC ${ZLIB_LIBRARIES} ${EXECINFO_LIBRARY} Threads::Threads
PUBLIC ${ZLIB_LIBRARIES} ${EXECINFO_LIBRARIES} Threads::Threads
${MALLOC_LIBRARIES}
${GLIBC_COMPATIBILITY_LIBRARIES}
${MEMCPY_LIBRARIES}

View File

@ -2,7 +2,7 @@ add_definitions(-Wno-error -Wno-unused-parameter -Wno-non-virtual-dtor -U_LIBCPP
link_directories(${LLVM_LIBRARY_DIRS})
add_library(clickhouse-compiler-lib ${LINK_MODE}
add_library(clickhouse-compiler-lib
driver.cpp
cc1_main.cpp
cc1gen_reproducer_main.cpp
@ -42,7 +42,7 @@ lldCore
${REQUIRED_LLVM_LIBRARIES}
PUBLIC ${ZLIB_LIBRARIES} ${EXECINFO_LIBRARY} Threads::Threads
PUBLIC ${ZLIB_LIBRARIES} ${EXECINFO_LIBRARIES} Threads::Threads
${MALLOC_LIBRARIES}
${GLIBC_COMPATIBILITY_LIBRARIES}
${MEMCPY_LIBRARIES}

View File

@ -2,7 +2,7 @@ add_definitions(-Wno-error -Wno-unused-parameter -Wno-non-virtual-dtor -U_LIBCPP
link_directories(${LLVM_LIBRARY_DIRS})
add_library(clickhouse-compiler-lib ${LINK_MODE}
add_library(clickhouse-compiler-lib
driver.cpp
cc1_main.cpp
cc1as_main.cpp
@ -42,7 +42,7 @@ lldCore
${REQUIRED_LLVM_LIBRARIES}
PUBLIC ${ZLIB_LIBRARIES} ${EXECINFO_LIBRARY} Threads::Threads
PUBLIC ${ZLIB_LIBRARIES} ${EXECINFO_LIBRARIES} Threads::Threads
${MALLOC_LIBRARIES}
${GLIBC_COMPATIBILITY_LIBRARIES}
${MEMCPY_LIBRARIES}

View File

@ -298,6 +298,8 @@ std::unordered_map<std::string, std::vector<std::size_t>> getTestQueryIndexes(co
{
std::unordered_map<std::string, std::vector<std::size_t>> result;
const auto & options = parsed_opts.options;
if (options.empty())
return result;
for (size_t i = 0; i < options.size() - 1; ++i)
{
const auto & opt = options[i];
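
The early return added above guards the loop bound: options.size() is unsigned, so with an empty options vector the expression options.size() - 1 wraps around to a huge value instead of -1. A minimal standalone sketch of that failure mode and of the guard (variable names are illustrative, not taken from the surrounding source):

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<int> options;  // empty, as when no test options were parsed

    // Unsigned wrap-around: 0 - 1 becomes SIZE_MAX, so an unguarded
    // "i < options.size() - 1" loop would run with out-of-range indexes.
    std::cout << "options.size() - 1 == " << options.size() - 1 << "\n";

    // The guard handles the degenerate case up front, mirroring the hunk above.
    if (options.empty())
    {
        std::cout << "no options, nothing to index\n";
        return 0;
    }

    for (size_t i = 0; i < options.size() - 1; ++i)
        std::cout << options[i] << "\n";
}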

View File

@ -9,55 +9,97 @@
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
namespace
{
/// Substitute return type for Date and DateTime
class AggregateFunctionGroupUniqArrayDate : public AggregateFunctionGroupUniqArray<DataTypeDate::FieldType>
template <typename has_limit>
class AggregateFunctionGroupUniqArrayDate : public AggregateFunctionGroupUniqArray<DataTypeDate::FieldType, has_limit>
{
public:
AggregateFunctionGroupUniqArrayDate(const DataTypePtr & argument_type) : AggregateFunctionGroupUniqArray<DataTypeDate::FieldType>(argument_type) {}
AggregateFunctionGroupUniqArrayDate(const DataTypePtr & argument_type, UInt64 max_elems_ = std::numeric_limits<UInt64>::max()) : AggregateFunctionGroupUniqArray<DataTypeDate::FieldType, has_limit>(argument_type, max_elems_) {}
DataTypePtr getReturnType() const override { return std::make_shared<DataTypeArray>(std::make_shared<DataTypeDate>()); }
};
class AggregateFunctionGroupUniqArrayDateTime : public AggregateFunctionGroupUniqArray<DataTypeDateTime::FieldType>
template <typename has_limit>
class AggregateFunctionGroupUniqArrayDateTime : public AggregateFunctionGroupUniqArray<DataTypeDateTime::FieldType, has_limit>
{
public:
AggregateFunctionGroupUniqArrayDateTime(const DataTypePtr & argument_type) : AggregateFunctionGroupUniqArray<DataTypeDateTime::FieldType>(argument_type) {}
AggregateFunctionGroupUniqArrayDateTime(const DataTypePtr & argument_type, UInt64 max_elems_ = std::numeric_limits<UInt64>::max()) : AggregateFunctionGroupUniqArray<DataTypeDateTime::FieldType, has_limit>(argument_type, max_elems_) {}
DataTypePtr getReturnType() const override { return std::make_shared<DataTypeArray>(std::make_shared<DataTypeDateTime>()); }
};
static IAggregateFunction * createWithExtraTypes(const DataTypePtr & argument_type)
template <typename has_limit, typename ... TArgs>
static IAggregateFunction * createWithExtraTypes(const DataTypePtr & argument_type, TArgs && ... args)
{
WhichDataType which(argument_type);
if (which.idx == TypeIndex::Date) return new AggregateFunctionGroupUniqArrayDate(argument_type);
else if (which.idx == TypeIndex::DateTime) return new AggregateFunctionGroupUniqArrayDateTime(argument_type);
if (which.idx == TypeIndex::Date) return new AggregateFunctionGroupUniqArrayDate<has_limit>(argument_type, std::forward<TArgs>(args)...);
else if (which.idx == TypeIndex::DateTime) return new AggregateFunctionGroupUniqArrayDateTime<has_limit>(argument_type, std::forward<TArgs>(args)...);
else
{
/// Check that we can use plain version of AggreagteFunctionGroupUniqArrayGeneric
if (argument_type->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
return new AggreagteFunctionGroupUniqArrayGeneric<true>(argument_type);
return new AggreagteFunctionGroupUniqArrayGeneric<true, has_limit>(argument_type, std::forward<TArgs>(args)...);
else
return new AggreagteFunctionGroupUniqArrayGeneric<false>(argument_type);
return new AggreagteFunctionGroupUniqArrayGeneric<false, has_limit>(argument_type, std::forward<TArgs>(args)...);
}
}
template <typename has_limit, typename ... TArgs>
inline AggregateFunctionPtr createAggregateFunctionGroupUniqArrayImpl(const std::string & name, const DataTypePtr & argument_type, TArgs ... args)
{
AggregateFunctionPtr res(createWithNumericType<AggregateFunctionGroupUniqArray, has_limit, const DataTypePtr &, TArgs...>(*argument_type, argument_type, std::forward<TArgs>(args)...));
if (!res)
res = AggregateFunctionPtr(createWithExtraTypes<has_limit>(argument_type, std::forward<TArgs>(args)...));
if (!res)
throw Exception("Illegal type " + argument_type->getName() +
" of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return res;
}
AggregateFunctionPtr createAggregateFunctionGroupUniqArray(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertNoParameters(name, parameters);
assertUnary(name, argument_types);
AggregateFunctionPtr res(createWithNumericType<AggregateFunctionGroupUniqArray>(*argument_types[0], argument_types[0]));
bool limit_size = false;
UInt64 max_elems = std::numeric_limits<UInt64>::max();
if (!res)
res = AggregateFunctionPtr(createWithExtraTypes(argument_types[0]));
if (parameters.empty())
{
// no limit
}
else if (parameters.size() == 1)
{
auto type = parameters[0].getType();
if (type != Field::Types::Int64 && type != Field::Types::UInt64)
throw Exception("Parameter for aggregate function " + name + " should be positive number", ErrorCodes::BAD_ARGUMENTS);
if (!res)
throw Exception("Illegal type " + argument_types[0]->getName() +
" of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if ((type == Field::Types::Int64 && parameters[0].get<Int64>() < 0) ||
(type == Field::Types::UInt64 && parameters[0].get<UInt64>() == 0))
throw Exception("Parameter for aggregate function " + name + " should be positive number", ErrorCodes::BAD_ARGUMENTS);
return res;
limit_size = true;
max_elems = parameters[0].get<UInt64>();
}
else
throw Exception("Incorrect number of parameters for aggregate function " + name + ", should be 0 or 1",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!limit_size)
return createAggregateFunctionGroupUniqArrayImpl<std::false_type>(name, argument_types[0]);
else
return createAggregateFunctionGroupUniqArrayImpl<std::true_type>(name, argument_types[0], max_elems);
}
}
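
A note on the dispatch above: whether a limit was supplied is decided at run time, but it is then baked into the template as std::true_type or std::false_type, so the unlimited specialisation pays nothing for the size check. A rough self-contained sketch of that pattern with invented names (Collector and runCollector are not ClickHouse classes):

#include <cstdint>
#include <iostream>
#include <limits>
#include <type_traits>
#include <utility>

// Invented stand-in for the aggregate state.
template <typename Tlimit_num_elem>
struct Collector
{
    static constexpr bool limit_num_elems = Tlimit_num_elem::value;
    uint64_t max_elems;
    uint64_t count = 0;

    explicit Collector(uint64_t max_elems_ = std::numeric_limits<uint64_t>::max())
        : max_elems(max_elems_) {}

    void add()
    {
        if (limit_num_elems && count >= max_elems)  // constant-folded for std::false_type
            return;
        ++count;
    }
};

// Mirrors the shape of the factory helper: the caller picks the specialisation,
// and any extra constructor arguments are forwarded through.
template <typename HasLimit, typename... TArgs>
uint64_t runCollector(uint64_t n, TArgs &&... args)
{
    Collector<HasLimit> collector(std::forward<TArgs>(args)...);
    for (uint64_t i = 0; i < n; ++i)
        collector.add();
    return collector.count;
}

int main()
{
    std::cout << runCollector<std::false_type>(5) << "\n";             // 5: no limit
    std::cout << runCollector<std::true_type>(5, uint64_t(2)) << "\n"; // 2: capped
}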

View File

@ -36,16 +36,21 @@ struct AggregateFunctionGroupUniqArrayData
/// Puts all values to the hash set. Returns an array of unique values. Implemented for numeric types.
template <typename T>
template <typename T, typename Tlimit_num_elem>
class AggregateFunctionGroupUniqArray
: public IAggregateFunctionDataHelper<AggregateFunctionGroupUniqArrayData<T>, AggregateFunctionGroupUniqArray<T>>
: public IAggregateFunctionDataHelper<AggregateFunctionGroupUniqArrayData<T>, AggregateFunctionGroupUniqArray<T, Tlimit_num_elem>>
{
static constexpr bool limit_num_elems = Tlimit_num_elem::value;
UInt64 max_elems;
private:
using State = AggregateFunctionGroupUniqArrayData<T>;
public:
AggregateFunctionGroupUniqArray(const DataTypePtr & argument_type)
: IAggregateFunctionDataHelper<AggregateFunctionGroupUniqArrayData<T>, AggregateFunctionGroupUniqArray<T>>({argument_type}, {}) {}
AggregateFunctionGroupUniqArray(const DataTypePtr & argument_type, UInt64 max_elems_ = std::numeric_limits<UInt64>::max())
: IAggregateFunctionDataHelper<AggregateFunctionGroupUniqArrayData<T>,
AggregateFunctionGroupUniqArray<T, Tlimit_num_elem>>({argument_type}, {}),
max_elems(max_elems_) {}
String getName() const override { return "groupUniqArray"; }
@ -56,12 +61,27 @@ public:
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
{
if (limit_num_elems && this->data(place).value.size() >= max_elems)
return;
this->data(place).value.insert(static_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num]);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).value.merge(this->data(rhs).value);
if (!limit_num_elems)
this->data(place).value.merge(this->data(rhs).value);
else
{
auto & cur_set = this->data(place).value;
auto & rhs_set = this->data(rhs).value;
for (auto & rhs_elem : rhs_set)
{
if (cur_set.size() >= max_elems)
return;
cur_set.insert(rhs_elem.getValue());
}
}
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
@ -111,25 +131,43 @@ struct AggreagteFunctionGroupUniqArrayGenericData
Set value;
};
/// Helper function for deserialize and insert for the class AggreagteFunctionGroupUniqArrayGeneric
template <bool is_plain_column>
static StringRef getSerializationImpl(const IColumn & column, size_t row_num, Arena & arena);
template <bool is_plain_column>
static void deserializeAndInsertImpl(StringRef str, IColumn & data_to);
/** Template parameter with true value should be used for columns that store their elements in memory continuously.
* For such columns groupUniqArray() can be implemented more efficiently (especially for small numeric arrays).
*/
template <bool is_plain_column = false>
template <bool is_plain_column = false, typename Tlimit_num_elem = std::false_type>
class AggreagteFunctionGroupUniqArrayGeneric
: public IAggregateFunctionDataHelper<AggreagteFunctionGroupUniqArrayGenericData, AggreagteFunctionGroupUniqArrayGeneric<is_plain_column>>
: public IAggregateFunctionDataHelper<AggreagteFunctionGroupUniqArrayGenericData, AggreagteFunctionGroupUniqArrayGeneric<is_plain_column, Tlimit_num_elem>>
{
DataTypePtr & input_data_type;
static constexpr bool limit_num_elems = Tlimit_num_elem::value;
UInt64 max_elems;
using State = AggreagteFunctionGroupUniqArrayGenericData;
static StringRef getSerialization(const IColumn & column, size_t row_num, Arena & arena);
static StringRef getSerialization(const IColumn & column, size_t row_num, Arena & arena)
{
return getSerializationImpl<is_plain_column>(column, row_num, arena);
}
static void deserializeAndInsert(StringRef str, IColumn & data_to);
static void deserializeAndInsert(StringRef str, IColumn & data_to)
{
return deserializeAndInsertImpl<is_plain_column>(str, data_to);
}
public:
AggreagteFunctionGroupUniqArrayGeneric(const DataTypePtr & input_data_type)
: IAggregateFunctionDataHelper<AggreagteFunctionGroupUniqArrayGenericData, AggreagteFunctionGroupUniqArrayGeneric<is_plain_column>>({input_data_type}, {})
, input_data_type(this->argument_types[0]) {}
AggreagteFunctionGroupUniqArrayGeneric(const DataTypePtr & input_data_type, UInt64 max_elems_ = std::numeric_limits<UInt64>::max())
: IAggregateFunctionDataHelper<AggreagteFunctionGroupUniqArrayGenericData, AggreagteFunctionGroupUniqArrayGeneric<is_plain_column, Tlimit_num_elem>>({input_data_type}, {})
, input_data_type(this->argument_types[0])
, max_elems(max_elems_) {}
String getName() const override { return "groupUniqArray"; }
@ -174,7 +212,10 @@ public:
bool inserted;
State::Set::iterator it;
if (limit_num_elems && set.size() >= max_elems)
return;
StringRef str_serialized = getSerialization(*columns[0], row_num, *arena);
set.emplace(str_serialized, it, inserted);
if constexpr (!is_plain_column)
@ -198,6 +239,8 @@ public:
State::Set::iterator it;
for (auto & rhs_elem : rhs_set)
{
if (limit_num_elems && cur_set.size() >= max_elems)
return ;
cur_set.emplace(rhs_elem.getValue(), it, inserted);
if (inserted)
{
@ -229,31 +272,30 @@ public:
template <>
inline StringRef AggreagteFunctionGroupUniqArrayGeneric<false>::getSerialization(const IColumn & column, size_t row_num, Arena & arena)
inline StringRef getSerializationImpl<false>(const IColumn & column, size_t row_num, Arena & arena)
{
const char * begin = nullptr;
return column.serializeValueIntoArena(row_num, arena, begin);
}
template <>
inline StringRef AggreagteFunctionGroupUniqArrayGeneric<true>::getSerialization(const IColumn & column, size_t row_num, Arena &)
inline StringRef getSerializationImpl<true>(const IColumn & column, size_t row_num, Arena &)
{
return column.getDataAt(row_num);
}
template <>
inline void AggreagteFunctionGroupUniqArrayGeneric<false>::deserializeAndInsert(StringRef str, IColumn & data_to)
inline void deserializeAndInsertImpl<false>(StringRef str, IColumn & data_to)
{
data_to.deserializeAndInsertFromArena(str.data);
}
template <>
inline void AggreagteFunctionGroupUniqArrayGeneric<true>::deserializeAndInsert(StringRef str, IColumn & data_to)
inline void deserializeAndInsertImpl<true>(StringRef str, IColumn & data_to)
{
data_to.insertData(str.data, str.size);
}
#undef AGGREGATE_FUNCTION_GROUP_ARRAY_UNIQ_MAX_SIZE
}
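
The merge path above stops pulling elements from the right-hand state once the cap is reached, rather than merging everything and truncating afterwards. A self-contained sketch of that bounded merge, using std::unordered_set in place of ClickHouse's hash set (function and variable names are mine):

#include <cstdint>
#include <iostream>
#include <unordered_set>

// Bounded merge: copy elements from rhs into lhs until lhs holds max_elems.
// Without a limit this degenerates to a plain set union.
void mergeWithLimit(std::unordered_set<int> & lhs,
                    const std::unordered_set<int> & rhs,
                    uint64_t max_elems)
{
    for (const int value : rhs)
    {
        if (lhs.size() >= max_elems)
            return;
        lhs.insert(value);
    }
}

int main()
{
    std::unordered_set<int> a{1, 2};
    std::unordered_set<int> b{3, 4, 5, 6};

    mergeWithLimit(a, b, 4);
    std::cout << a.size() << "\n"; // 4: the two original elements plus two merged ones
}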

View File

@ -19,7 +19,7 @@ list(REMOVE_ITEM clickhouse_aggregate_functions_headers
FactoryHelpers.h
)
add_library(clickhouse_aggregate_functions ${LINK_MODE} ${clickhouse_aggregate_functions_sources})
add_library(clickhouse_aggregate_functions ${clickhouse_aggregate_functions_sources})
target_link_libraries(clickhouse_aggregate_functions PRIVATE dbms PUBLIC ${CITYHASH_LIBRARIES})
target_include_directories (clickhouse_aggregate_functions BEFORE PRIVATE ${COMMON_INCLUDE_DIR})

View File

@ -1,7 +1,7 @@
# TODO: make separate lib datastream, block, ...
#include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
#add_headers_and_sources(clickhouse_client .)
#add_library(clickhouse_client ${LINK_MODE} ${clickhouse_client_headers} ${clickhouse_client_sources})
#add_library(clickhouse_client ${clickhouse_client_headers} ${clickhouse_client_sources})
#target_link_libraries (clickhouse_client clickhouse_common_io ${Poco_Net_LIBRARY})
#target_include_directories (clickhouse_client PRIVATE ${DBMS_INCLUDE_DIR})

View File

@ -2,7 +2,7 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_common_config .)
add_library(clickhouse_common_config ${LINK_MODE} ${clickhouse_common_config_headers} ${clickhouse_common_config_sources})
add_library(clickhouse_common_config ${clickhouse_common_config_headers} ${clickhouse_common_config_sources})
target_link_libraries(clickhouse_common_config PUBLIC common PRIVATE clickhouse_common_zookeeper string_utils PUBLIC ${Poco_XML_LIBRARY} ${Poco_Util_LIBRARY} Threads::Threads)
target_include_directories(clickhouse_common_config PUBLIC ${DBMS_INCLUDE_DIR})

View File

@ -5,5 +5,5 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_common_stringutils .)
add_library(string_utils ${LINK_MODE} ${clickhouse_common_stringutils_headers} ${clickhouse_common_stringutils_sources})
add_library(string_utils ${clickhouse_common_stringutils_headers} ${clickhouse_common_stringutils_sources})
target_include_directories (string_utils PRIVATE ${DBMS_INCLUDE_DIR})

View File

@ -2,7 +2,7 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_common_zookeeper .)
add_library(clickhouse_common_zookeeper ${LINK_MODE} ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources})
add_library(clickhouse_common_zookeeper ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources})
target_link_libraries (clickhouse_common_zookeeper PUBLIC clickhouse_common_io common PRIVATE string_utils PUBLIC ${Poco_Util_LIBRARY} Threads::Threads)
target_include_directories(clickhouse_common_zookeeper PUBLIC ${DBMS_INCLUDE_DIR})

View File

@ -1,6 +1,6 @@
include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_compression .)
add_library(clickhouse_compression ${LINK_MODE} ${clickhouse_compression_headers} ${clickhouse_compression_sources})
add_library(clickhouse_compression ${clickhouse_compression_headers} ${clickhouse_compression_sources})
target_link_libraries(clickhouse_compression PRIVATE clickhouse_parsers clickhouse_common_io ${ZSTD_LIBRARY} ${LZ4_LIBRARY} ${CITYHASH_LIBRARIES})
target_include_directories(clickhouse_compression PUBLIC ${DBMS_INCLUDE_DIR})
target_include_directories(clickhouse_compression SYSTEM PUBLIC ${PCG_RANDOM_INCLUDE_DIR})

View File

@ -153,7 +153,8 @@ struct Settings
\
M(SettingBool, add_http_cors_header, false, "Write add http CORS header.") \
\
M(SettingBool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow and TSKV formats).") \
M(SettingBool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).") \
M(SettingBool, input_format_with_names_use_header, false, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.") \
M(SettingBool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).") \
M(SettingBool, input_format_defaults_for_omitted_fields, false, "For input data calculate default expressions for omitted fields (it works for JSONEachRow format).") \
\

View File

@ -14,7 +14,7 @@ add_headers_and_sources(clickhouse_dictionaries ${CMAKE_CURRENT_BINARY_DIR}/gene
list(REMOVE_ITEM clickhouse_dictionaries_sources DictionaryFactory.cpp DictionarySourceFactory.cpp DictionaryStructure.cpp)
list(REMOVE_ITEM clickhouse_dictionaries_headers DictionaryFactory.h DictionarySourceFactory.h DictionaryStructure.h)
add_library(clickhouse_dictionaries ${LINK_MODE} ${clickhouse_dictionaries_sources})
add_library(clickhouse_dictionaries ${clickhouse_dictionaries_sources})
target_link_libraries(clickhouse_dictionaries PRIVATE dbms clickhouse_common_io ${BTRIE_LIBRARIES} PUBLIC Threads::Threads)
if(Poco_SQL_FOUND AND NOT USE_INTERNAL_POCO_LIBRARY)

View File

@ -1,5 +1,5 @@
include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_dictionaries_embedded .)
add_headers_and_sources(clickhouse_dictionaries_embedded GeodataProviders)
add_library(clickhouse_dictionaries_embedded ${LINK_MODE} ${clickhouse_dictionaries_embedded_sources})
add_library(clickhouse_dictionaries_embedded ${clickhouse_dictionaries_embedded_sources})
target_link_libraries(clickhouse_dictionaries_embedded PRIVATE clickhouse_common_io ${MYSQLXX_LIBRARY})

View File

@ -19,17 +19,7 @@ namespace ErrorCodes
}
CSVRowInputStream::CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings)
: istr(istr_), header(header_), with_names(with_names_), format_settings(format_settings)
{
size_t num_columns = header.columns();
data_types.resize(num_columns);
for (size_t i = 0; i < num_columns; ++i)
data_types[i] = header.safeGetByPosition(i).type;
}
static void skipEndOfLine(ReadBuffer & istr)
static inline void skipEndOfLine(ReadBuffer & istr)
{
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
@ -53,7 +43,7 @@ static void skipEndOfLine(ReadBuffer & istr)
}
static void skipDelimiter(ReadBuffer & istr, const char delimiter, bool is_last_column)
static inline void skipDelimiter(ReadBuffer & istr, const char delimiter, bool is_last_column)
{
if (is_last_column)
{
@ -99,38 +89,149 @@ static void skipRow(ReadBuffer & istr, const FormatSettings::CSV & settings, siz
}
CSVRowInputStream::CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings)
: istr(istr_), header(header_), with_names(with_names_), format_settings(format_settings)
{
const auto num_columns = header.columns();
data_types.resize(num_columns);
column_indexes_by_names.reserve(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_info = header.getByPosition(i);
data_types[i] = column_info.type;
column_indexes_by_names.emplace(column_info.name, i);
}
column_indexes_for_input_fields.reserve(num_columns);
read_columns.assign(num_columns, false);
}
void CSVRowInputStream::setupAllColumnsByTableSchema()
{
read_columns.assign(header.columns(), true);
column_indexes_for_input_fields.resize(header.columns());
for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
{
column_indexes_for_input_fields[i] = i;
}
}
void CSVRowInputStream::addInputColumn(const String & column_name)
{
const auto column_it = column_indexes_by_names.find(column_name);
if (column_it == column_indexes_by_names.end())
{
if (format_settings.skip_unknown_fields)
{
column_indexes_for_input_fields.push_back(std::nullopt);
return;
}
throw Exception(
"Unknown field found in CSV header: '" + column_name + "' " +
"at position " + std::to_string(column_indexes_for_input_fields.size()) +
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
ErrorCodes::INCORRECT_DATA
);
}
const auto column_index = column_it->second;
if (read_columns[column_index])
throw Exception("Duplicate field found while parsing CSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
read_columns[column_index] = true;
column_indexes_for_input_fields.emplace_back(column_index);
}
void CSVRowInputStream::fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension)
{
/// It is safe to memorize this on the first run - the format guarantees this does not change
if (unlikely(row_num == 1))
{
columns_to_fill_with_default_values.clear();
for (size_t index = 0; index < read_columns.size(); ++index)
if (read_columns[index] == 0)
columns_to_fill_with_default_values.push_back(index);
}
for (const auto column_index : columns_to_fill_with_default_values)
data_types[column_index]->insertDefaultInto(*columns[column_index]);
row_read_extension.read_columns = read_columns;
}
void CSVRowInputStream::readPrefix()
{
/// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes,
/// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it.
skipBOMIfExists(istr);
size_t num_columns = data_types.size();
String tmp;
if (with_names)
skipRow(istr, format_settings.csv, num_columns);
{
if (format_settings.with_names_use_header)
{
String column_name;
do
{
skipWhitespacesAndTabs(istr);
readCSVString(column_name, istr, format_settings.csv);
skipWhitespacesAndTabs(istr);
addInputColumn(column_name);
}
while (checkChar(format_settings.csv.delimiter, istr));
skipDelimiter(istr, format_settings.csv.delimiter, true);
}
else
{
setupAllColumnsByTableSchema();
skipRow(istr, format_settings.csv, column_indexes_for_input_fields.size());
}
}
else
{
setupAllColumnsByTableSchema();
}
}
bool CSVRowInputStream::read(MutableColumns & columns, RowReadExtension &)
bool CSVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
{
if (istr.eof())
return false;
updateDiagnosticInfo();
size_t size = data_types.size();
for (size_t i = 0; i < size; ++i)
String tmp;
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
{
skipWhitespacesAndTabs(istr);
data_types[i]->deserializeAsTextCSV(*columns[i], istr, format_settings);
skipWhitespacesAndTabs(istr);
const auto & column_index = column_indexes_for_input_fields[input_position];
if (column_index)
{
skipWhitespacesAndTabs(istr);
data_types[*column_index]->deserializeAsTextCSV(*columns[*column_index], istr, format_settings);
skipWhitespacesAndTabs(istr);
}
else
{
readCSVString(tmp, istr, format_settings.csv);
}
skipDelimiter(istr, format_settings.csv.delimiter, i + 1 == size);
skipDelimiter(istr, format_settings.csv.delimiter, input_position + 1 == column_indexes_for_input_fields.size());
}
fillUnreadColumnsWithDefaults(columns, ext);
return true;
}
@ -202,86 +303,101 @@ bool OPTIMIZE(1) CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumn
{
const char delimiter = format_settings.csv.delimiter;
size_t size = data_types.size();
for (size_t i = 0; i < size; ++i)
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
{
if (i == 0 && istr.eof())
if (input_position == 0 && istr.eof())
{
out << "<End of stream>\n";
return false;
}
out << "Column " << i << ", " << std::string((i < 10 ? 2 : i < 100 ? 1 : 0), ' ')
<< "name: " << header.safeGetByPosition(i).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(i).name.size(), ' ')
<< "type: " << data_types[i]->getName() << ", " << std::string(max_length_of_data_type_name - data_types[i]->getName().size(), ' ');
BufferBase::Position prev_position = istr.position();
BufferBase::Position curr_position = istr.position();
std::exception_ptr exception;
try
if (column_indexes_for_input_fields[input_position].has_value())
{
skipWhitespacesAndTabs(istr);
prev_position = istr.position();
data_types[i]->deserializeAsTextCSV(*columns[i], istr, format_settings);
curr_position = istr.position();
skipWhitespacesAndTabs(istr);
}
catch (...)
{
exception = std::current_exception();
}
const auto & column_index = *column_indexes_for_input_fields[input_position];
const auto & current_column_type = data_types[column_index];
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
<< "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ')
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
if (isNumber(data_types[i]) || isDateOrDateTime(data_types[i]))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
BufferBase::Position prev_position = istr.position();
BufferBase::Position curr_position = istr.position();
std::exception_ptr exception;
try
{
out << "ERROR: text ";
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
out << " is not like " << data_types[i]->getName() << "\n";
return false;
skipWhitespacesAndTabs(istr);
prev_position = istr.position();
current_column_type->deserializeAsTextCSV(*columns[column_index], istr, format_settings);
curr_position = istr.position();
skipWhitespacesAndTabs(istr);
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (data_types[i]->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (data_types[i]->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
if (data_types[i]->haveMaximumSizeOfValue())
{
if (*curr_position != '\n' && *curr_position != '\r' && *curr_position != delimiter)
catch (...)
{
out << "ERROR: garbage after " << data_types[i]->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
out << "\n";
exception = std::current_exception();
}
if (data_types[i]->getName() == "DateTime")
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
if (isNumber(current_column_type) || isDateOrDateTime(current_column_type))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
{
out << "ERROR: text ";
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
out << " is not like " << current_column_type->getName() << "\n";
return false;
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (data_types[i]->getName() == "Date")
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
if (current_column_type->haveMaximumSizeOfValue())
{
if (*curr_position != '\n' && *curr_position != '\r' && *curr_position != delimiter)
{
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
out << "\n";
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
return false;
}
}
}
else
{
static const String skipped_column_str = "<SKIPPED COLUMN>";
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');
String tmp;
readCSVString(tmp, istr, format_settings.csv);
}
/// Delimiters
if (i + 1 == size)
if (input_position + 1 == column_indexes_for_input_fields.size())
{
if (istr.eof())
return false;

View File

@ -1,5 +1,8 @@
#pragma once
#include <optional>
#include <unordered_map>
#include <Core/Block.h>
#include <Formats/IRowInputStream.h>
#include <Formats/FormatSettings.h>
@ -21,7 +24,7 @@ public:
*/
CSVRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, const FormatSettings & format_settings);
bool read(MutableColumns & columns, RowReadExtension &) override;
bool read(MutableColumns & columns, RowReadExtension & ext) override;
void readPrefix() override;
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
@ -36,6 +39,19 @@ private:
const FormatSettings format_settings;
using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names;
using OptionalIndexes = std::vector<std::optional<size_t>>;
OptionalIndexes column_indexes_for_input_fields;
std::vector<UInt8> read_columns;
std::vector<size_t> columns_to_fill_with_default_values;
void addInputColumn(const String & column_name);
void setupAllColumnsByTableSchema();
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext);
/// For convenient diagnostics in case of an error.
size_t row_num = 0;
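
The new members above implement header-driven column mapping: each field of the input header is resolved to an optional table-column index, unknown fields become std::nullopt (and are skipped when input_format_skip_unknown_fields is set), and table columns never mentioned in the header are later filled with defaults. A simplified standalone sketch of the mapping step, with invented data (the TSV reader changed further down follows the same scheme):

#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
    // Table schema: column name -> position, as column_indexes_by_names does.
    std::unordered_map<std::string, size_t> column_indexes_by_names{
        {"id", 0}, {"name", 1}, {"value", 2}};

    // Header actually present in the input file.
    std::vector<std::string> input_header{"value", "unknown_field", "id"};

    const bool skip_unknown_fields = true;

    // One entry per input field: the table column it feeds, or nullopt to skip it.
    std::vector<std::optional<size_t>> column_indexes_for_input_fields;
    std::vector<bool> read_columns(column_indexes_by_names.size(), false);

    for (const auto & column_name : input_header)
    {
        const auto it = column_indexes_by_names.find(column_name);
        if (it == column_indexes_by_names.end())
        {
            if (!skip_unknown_fields)
                throw std::runtime_error("Unknown field in header: " + column_name);
            column_indexes_for_input_fields.push_back(std::nullopt);
            continue;
        }
        read_columns[it->second] = true;
        column_indexes_for_input_fields.push_back(it->second);
    }

    // Columns never mentioned in the header ("name" here) would be filled
    // with default values, analogous to fillUnreadColumnsWithDefaults.
    for (size_t i = 0; i < read_columns.size(); ++i)
        if (!read_columns[i])
            std::cout << "column " << i << " gets a default value\n";
}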

View File

@ -40,6 +40,7 @@ BlockInputStreamPtr FormatFactory::getInput(const String & name, ReadBuffer & bu
format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
format_settings.with_names_use_header = settings.input_format_with_names_use_header;
format_settings.skip_unknown_fields = settings.input_format_skip_unknown_fields;
format_settings.import_nested_json = settings.input_format_import_nested_json;
format_settings.date_time_input_format = settings.date_time_input_format;

View File

@ -48,6 +48,7 @@ struct FormatSettings
Values values;
bool skip_unknown_fields = false;
bool with_names_use_header = false;
bool write_statistics = true;
bool import_nested_json = false;

View File

@ -1,3 +1,5 @@
#include <string>
#include <Core/Defines.h>
#include <IO/ReadHelpers.h>
@ -20,46 +22,14 @@ namespace ErrorCodes
}
TabSeparatedRowInputStream::TabSeparatedRowInputStream(
ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings)
: istr(istr_), header(header_), with_names(with_names_), with_types(with_types_), format_settings(format_settings)
static void skipTSVRow(ReadBuffer & istr, const size_t num_columns)
{
size_t num_columns = header.columns();
data_types.resize(num_columns);
NullSink null_sink;
for (size_t i = 0; i < num_columns; ++i)
data_types[i] = header.safeGetByPosition(i).type;
}
void TabSeparatedRowInputStream::readPrefix()
{
size_t num_columns = header.columns();
String tmp;
if (with_names || with_types)
{
/// In this format, we assume that column name or type cannot contain BOM,
/// so, if format has header,
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
skipBOMIfExists(istr);
}
if (with_names)
{
for (size_t i = 0; i < num_columns; ++i)
{
readEscapedString(tmp, istr);
assertChar(i == num_columns - 1 ? '\n' : '\t', istr);
}
}
if (with_types)
{
for (size_t i = 0; i < num_columns; ++i)
{
readEscapedString(tmp, istr);
assertChar(i == num_columns - 1 ? '\n' : '\t', istr);
}
readEscapedStringInto(null_sink, istr);
assertChar(i == num_columns - 1 ? '\n' : '\t', istr);
}
}
@ -77,34 +47,165 @@ static void checkForCarriageReturn(ReadBuffer & istr)
}
bool TabSeparatedRowInputStream::read(MutableColumns & columns, RowReadExtension &)
TabSeparatedRowInputStream::TabSeparatedRowInputStream(
ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings)
: istr(istr_), header(header_), with_names(with_names_), with_types(with_types_), format_settings(format_settings)
{
const auto num_columns = header.columns();
data_types.resize(num_columns);
column_indexes_by_names.reserve(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_info = header.getByPosition(i);
data_types[i] = column_info.type;
column_indexes_by_names.emplace(column_info.name, i);
}
column_indexes_for_input_fields.reserve(num_columns);
read_columns.assign(num_columns, false);
}
void TabSeparatedRowInputStream::setupAllColumnsByTableSchema()
{
read_columns.assign(header.columns(), true);
column_indexes_for_input_fields.resize(header.columns());
for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
column_indexes_for_input_fields[i] = i;
}
void TabSeparatedRowInputStream::addInputColumn(const String & column_name)
{
const auto column_it = column_indexes_by_names.find(column_name);
if (column_it == column_indexes_by_names.end())
{
if (format_settings.skip_unknown_fields)
{
column_indexes_for_input_fields.push_back(std::nullopt);
return;
}
throw Exception(
"Unknown field found in TSV header: '" + column_name + "' " +
"at position " + std::to_string(column_indexes_for_input_fields.size()) +
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
ErrorCodes::INCORRECT_DATA
);
}
const auto column_index = column_it->second;
if (read_columns[column_index])
throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
read_columns[column_index] = true;
column_indexes_for_input_fields.emplace_back(column_index);
}
void TabSeparatedRowInputStream::fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension)
{
/// It is safe to memorize this on the first run - the format guarantees this does not change
if (unlikely(row_num == 1))
{
columns_to_fill_with_default_values.clear();
for (size_t index = 0; index < read_columns.size(); ++index)
if (read_columns[index] == 0)
columns_to_fill_with_default_values.push_back(index);
}
for (const auto column_index : columns_to_fill_with_default_values)
data_types[column_index]->insertDefaultInto(*columns[column_index]);
row_read_extension.read_columns = read_columns;
}
void TabSeparatedRowInputStream::readPrefix()
{
if (with_names || with_types)
{
/// In this format, we assume that column name or type cannot contain BOM,
/// so, if format has header,
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
skipBOMIfExists(istr);
}
if (with_names)
{
if (format_settings.with_names_use_header)
{
String column_name;
do
{
readEscapedString(column_name, istr);
addInputColumn(column_name);
}
while (checkChar('\t', istr));
if (!istr.eof())
{
checkForCarriageReturn(istr);
assertChar('\n', istr);
}
}
else
{
setupAllColumnsByTableSchema();
skipTSVRow(istr, column_indexes_for_input_fields.size());
}
}
else
setupAllColumnsByTableSchema();
if (with_types)
{
skipTSVRow(istr, column_indexes_for_input_fields.size());
}
}
bool TabSeparatedRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
{
if (istr.eof())
return false;
updateDiagnosticInfo();
size_t size = data_types.size();
for (size_t i = 0; i < size; ++i)
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
{
data_types[i]->deserializeAsTextEscaped(*columns[i], istr, format_settings);
/// skip separators
if (i + 1 == size)
const auto & column_index = column_indexes_for_input_fields[input_position];
if (column_index)
{
if (!istr.eof())
{
if (unlikely(row_num == 1))
checkForCarriageReturn(istr);
assertChar('\n', istr);
}
data_types[*column_index]->deserializeAsTextEscaped(*columns[*column_index], istr, format_settings);
}
else
{
NullSink null_sink;
readEscapedStringInto(null_sink, istr);
}
/// skip separators
if (input_position + 1 < column_indexes_for_input_fields.size())
{
assertChar('\t', istr);
}
else if (!istr.eof())
{
if (unlikely(row_num == 1))
checkForCarriageReturn(istr);
assertChar('\n', istr);
}
}
fillUnreadColumnsWithDefaults(columns, ext);
return true;
}
@ -135,7 +236,7 @@ String TabSeparatedRowInputStream::getDiagnosticInfo()
if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name)
max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size();
/// Roll back the cursor to the beginning of the previous or current line and pars all over again. But now we derive detailed information.
/// Roll back the cursor to the beginning of the previous or current line and parse all over again. But now we derive detailed information.
if (pos_of_prev_row)
{
@ -173,83 +274,98 @@ String TabSeparatedRowInputStream::getDiagnosticInfo()
bool OPTIMIZE(1) TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(
MutableColumns & columns, WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
size_t size = data_types.size();
for (size_t i = 0; i < size; ++i)
for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
{
if (i == 0 && istr.eof())
if (input_position == 0 && istr.eof())
{
out << "<End of stream>\n";
return false;
}
out << "Column " << i << ", " << std::string((i < 10 ? 2 : i < 100 ? 1 : 0), ' ')
<< "name: " << header.safeGetByPosition(i).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(i).name.size(), ' ')
<< "type: " << data_types[i]->getName() << ", " << std::string(max_length_of_data_type_name - data_types[i]->getName().size(), ' ');
auto prev_position = istr.position();
std::exception_ptr exception;
try
if (column_indexes_for_input_fields[input_position].has_value())
{
data_types[i]->deserializeAsTextEscaped(*columns[i], istr, format_settings);
}
catch (...)
{
exception = std::current_exception();
}
const auto & column_index = *column_indexes_for_input_fields[input_position];
const auto & current_column_type = data_types[column_index];
auto curr_position = istr.position();
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
<< "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ')
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
auto prev_position = istr.position();
std::exception_ptr exception;
if (isNumber(data_types[i]) || isDateOrDateTime(data_types[i]))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
try
{
out << "ERROR: text ";
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
out << " is not like " << data_types[i]->getName() << "\n";
return false;
current_column_type->deserializeAsTextEscaped(*columns[column_index], istr, format_settings);
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (data_types[i]->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (data_types[i]->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
if (data_types[i]->haveMaximumSizeOfValue())
{
if (*curr_position != '\n' && *curr_position != '\t')
catch (...)
{
out << "ERROR: garbage after " << data_types[i]->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
out << "\n";
exception = std::current_exception();
}
if (data_types[i]->getName() == "DateTime")
auto curr_position = istr.position();
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
if (isNumber(current_column_type) || isDateOrDateTime(current_column_type))
{
/// An empty string instead of a value.
if (curr_position == prev_position)
{
out << "ERROR: text ";
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
out << " is not like " << current_column_type->getName() << "\n";
return false;
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (data_types[i]->getName() == "Date")
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
if (current_column_type->haveMaximumSizeOfValue())
{
if (*curr_position != '\n' && *curr_position != '\t')
{
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
out << "\n";
if (current_column_type->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (current_column_type->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
return false;
}
}
}
else
{
static const String skipped_column_str = "<SKIPPED COLUMN>";
out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');
NullSink null_sink;
readEscapedStringInto(null_sink, istr);
}
/// Delimiters
if (i + 1 == size)
if (input_position + 1 == column_indexes_for_input_fields.size())
{
if (!istr.eof())
{

View File

@ -1,5 +1,8 @@
#pragma once
#include <optional>
#include <unordered_map>
#include <Core/Block.h>
#include <Formats/FormatSettings.h>
#include <Formats/IRowInputStream.h>
@ -22,7 +25,7 @@ public:
TabSeparatedRowInputStream(
ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_, const FormatSettings & format_settings);
bool read(MutableColumns & columns, RowReadExtension &) override;
bool read(MutableColumns & columns, RowReadExtension & ext) override;
void readPrefix() override;
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
@ -37,6 +40,19 @@ private:
const FormatSettings format_settings;
DataTypes data_types;
using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names;
using OptionalIndexes = std::vector<std::optional<size_t>>;
OptionalIndexes column_indexes_for_input_fields;
std::vector<UInt8> read_columns;
std::vector<size_t> columns_to_fill_with_default_values;
void addInputColumn(const String & column_name);
void setupAllColumnsByTableSchema();
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension& ext);
/// For convenient diagnostics in case of an error.
size_t row_num = 0;

View File

@ -7,7 +7,7 @@ add_headers_and_sources(clickhouse_functions .)
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)
add_library(clickhouse_functions ${LINK_MODE} ${clickhouse_functions_sources})
add_library(clickhouse_functions ${clickhouse_functions_sources})
target_link_libraries(clickhouse_functions
PUBLIC

View File

@ -36,7 +36,6 @@
#include <AggregateFunctions/parseAggregateFunctionParameters.h>
#include <Storages/StorageDistributed.h>
#include <Storages/StorageMemory.h>
#include <Storages/StorageJoin.h>
#include <DataStreams/copyData.h>

View File

@ -1,6 +1,6 @@
include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_parsers .)
add_library(clickhouse_parsers ${LINK_MODE} ${clickhouse_parsers_headers} ${clickhouse_parsers_sources})
add_library(clickhouse_parsers ${clickhouse_parsers_headers} ${clickhouse_parsers_sources})
target_link_libraries(clickhouse_parsers PUBLIC clickhouse_common_io)
target_include_directories(clickhouse_parsers PUBLIC ${DBMS_INCLUDE_DIR})

View File

@ -1,7 +1,7 @@
if(USE_RDKAFKA)
include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_storage_kafka .)
add_library(clickhouse_storage_kafka ${LINK_MODE} ${clickhouse_storage_kafka_sources})
add_library(clickhouse_storage_kafka ${clickhouse_storage_kafka_sources})
target_link_libraries(clickhouse_storage_kafka PRIVATE clickhouse_common_io ${CPPKAFKA_LIBRARY} ${RDKAFKA_LIBRARY})
if(NOT USE_INTERNAL_RDKAFKA_LIBRARY)
target_include_directories(clickhouse_storage_kafka SYSTEM BEFORE PRIVATE ${RDKAFKA_INCLUDE_DIR})

View File

@ -11,5 +11,5 @@ configure_file (StorageSystemBuildOptions.generated.cpp.in ${CONFIG_BUILD})
include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(storages_system .)
list (APPEND storages_system_sources ${CONFIG_BUILD})
add_library(clickhouse_storages_system ${LINK_MODE} ${storages_system_headers} ${storages_system_sources})
add_library(clickhouse_storages_system ${storages_system_headers} ${storages_system_sources})
target_link_libraries(clickhouse_storages_system PRIVATE dbms common string_utils clickhouse_common_zookeeper)

View File

@ -4,5 +4,5 @@ add_headers_and_sources(clickhouse_table_functions .)
list(REMOVE_ITEM clickhouse_table_functions_sources ITableFunction.cpp TableFunctionFactory.cpp)
list(REMOVE_ITEM clickhouse_table_functions_headers ITableFunction.h TableFunctionFactory.h)
add_library(clickhouse_table_functions ${LINK_MODE} ${clickhouse_table_functions_sources})
add_library(clickhouse_table_functions ${clickhouse_table_functions_sources})
target_link_libraries(clickhouse_table_functions PRIVATE clickhouse_storages_system dbms ${Poco_Foundation_LIBRARY})

View File

@ -19,6 +19,8 @@ from errno import ESRCH
import termcolor
from random import random
import commands
from multiprocessing import Pool
from contextlib import closing
MESSAGES_TO_RETRY = [
@ -86,23 +88,161 @@ def get_server_pid(server_tcp_port):
except Exception as ex:
return None
def colored(text, args, color=None, on_color=None, attrs=None):
if sys.stdout.isatty() or args.force_color:
return termcolor.colored(text, color, on_color, attrs)
else:
return text
SERVER_DIED = False
exit_code = 0
#def run_tests_array(all_tests, suite, suite_dir, suite_tmp_dir, run_total):
def run_tests_array(all_tests_with_params):
all_tests, suite, suite_dir, suite_tmp_dir, run_total = all_tests_with_params
global exit_code
global SERVER_DIED
OP_SQUARE_BRACKET = colored("[", args, attrs=['bold'])
CL_SQUARE_BRACKET = colored("]", args, attrs=['bold'])
MSG_FAIL = OP_SQUARE_BRACKET + colored(" FAIL ", args, "red", attrs=['bold']) + CL_SQUARE_BRACKET
MSG_UNKNOWN = OP_SQUARE_BRACKET + colored(" UNKNOWN ", args, "yellow", attrs=['bold']) + CL_SQUARE_BRACKET
MSG_OK = OP_SQUARE_BRACKET + colored(" OK ", args, "green", attrs=['bold']) + CL_SQUARE_BRACKET
MSG_SKIPPED = OP_SQUARE_BRACKET + colored(" SKIPPED ", args, "cyan", attrs=['bold']) + CL_SQUARE_BRACKET
passed_total = 0
skipped_total = 0
failures_total = 0
failures = 0
failures_chain = 0
if len(all_tests):
print("\nRunning {} {} tests.".format(len(all_tests), suite) + "\n")
for case in all_tests:
if SERVER_DIED:
break
case_file = os.path.join(suite_dir, case)
(name, ext) = os.path.splitext(case)
try:
sys.stdout.write("{0:72}".format(name + ": "))
if run_total == 1:
sys.stdout.flush()
if args.skip and any(s in name for s in args.skip):
print(MSG_SKIPPED + " - skip")
skipped_total += 1
elif not args.zookeeper and 'zookeeper' in name:
print(MSG_SKIPPED + " - no zookeeper")
skipped_total += 1
elif not args.shard and 'shard' in name:
print(MSG_SKIPPED + " - no shard")
skipped_total += 1
elif not args.no_long and 'long' in name:
print(MSG_SKIPPED + " - no long")
skipped_total += 1
else:
disabled_file = os.path.join(suite_dir, name) + '.disabled'
if os.path.exists(disabled_file) and not args.disabled:
message = open(disabled_file, 'r').read()
print(MSG_SKIPPED + " - " + message)
else:
if args.testname:
clickhouse_proc = Popen(shlex.split(args.client_with_database), stdin=PIPE, stdout=PIPE, stderr=PIPE)
clickhouse_proc.communicate("SELECT 'Running test {suite}/{case} from pid={pid}';".format(pid = os.getpid(), case = case, suite = suite))
reference_file = os.path.join(suite_dir, name) + '.reference'
stdout_file = os.path.join(suite_tmp_dir, name) + '.stdout'
stderr_file = os.path.join(suite_tmp_dir, name) + '.stderr'
proc, stdout, stderr = run_single_test(args, ext, server_logs_level, case_file, stdout_file, stderr_file)
if proc.returncode is None:
try:
proc.kill()
except OSError as e:
if e.errno != ESRCH:
raise
failures += 1
print("{0} - Timeout!".format(MSG_FAIL))
else:
counter = 1
while proc.returncode != 0 and need_retry(stderr):
proc, stdout, stderr = run_single_test(args, ext, server_logs_level, case_file, stdout_file, stderr_file)
sleep(2**counter)
counter += 1
if counter > 6:
break
if proc.returncode != 0:
failures += 1
failures_chain += 1
print("{0} - return code {1}".format(MSG_FAIL, proc.returncode))
if stderr:
print(stderr.encode('utf-8'))
if args.stop and ('Connection refused' in stderr or 'Attempt to read after eof' in stderr) and not 'Received exception from server' in stderr:
SERVER_DIED = True
elif stderr:
failures += 1
failures_chain += 1
print("{0} - having stderror:\n{1}".format(MSG_FAIL, stderr.encode('utf-8')))
elif 'Exception' in stdout:
failures += 1
failures_chain += 1
print("{0} - having exception:\n{1}".format(MSG_FAIL, stdout.encode('utf-8')))
elif not os.path.isfile(reference_file):
print("{0} - no reference file".format(MSG_UNKNOWN))
else:
result_is_different = subprocess.call(['diff', '-q', reference_file, stdout_file], stdout = PIPE)
if result_is_different:
diff = Popen(['diff', '--unified', reference_file, stdout_file], stdout = PIPE).communicate()[0]
failures += 1
print("{0} - result differs with reference:\n{1}".format(MSG_FAIL, diff))
else:
passed_total += 1
failures_chain = 0
print(MSG_OK)
if os.path.exists(stdout_file):
os.remove(stdout_file)
if os.path.exists(stderr_file):
os.remove(stderr_file)
except KeyboardInterrupt as e:
print(colored("Break tests execution", args, "red"))
raise e
except:
import traceback
exc_type, exc_value, tb = sys.exc_info()
failures += 1
print("{0} - Test internal error: {1}\n{2}\n{3}".format(MSG_FAIL, exc_type.__name__, exc_value, "\n".join(traceback.format_tb(tb, 10))))
if failures_chain >= 20:
break
failures_total = failures_total + failures
if failures_total > 0:
print(colored("\nHaving {failures_total} errors! {passed_total} tests passed. {skipped_total} tests skipped.".format(passed_total = passed_total, skipped_total = skipped_total, failures_total = failures_total), args, "red", attrs=["bold"]))
exit_code = 1
else:
print(colored("\n{passed_total} tests passed. {skipped_total} tests skipped.".format(passed_total = passed_total, skipped_total = skipped_total), args, "green", attrs=["bold"]))
server_logs_level = "warning"
def main(args):
SERVER_DIED = False
def colored(text, color=None, on_color=None, attrs=None):
if sys.stdout.isatty() or args.force_color:
return termcolor.colored(text, color, on_color, attrs)
else:
return text
OP_SQUARE_BRACKET = colored("[", attrs=['bold'])
CL_SQUARE_BRACKET = colored("]", attrs=['bold'])
MSG_FAIL = OP_SQUARE_BRACKET + colored(" FAIL ", "red", attrs=['bold']) + CL_SQUARE_BRACKET
MSG_UNKNOWN = OP_SQUARE_BRACKET + colored(" UNKNOWN ", "yellow", attrs=['bold']) + CL_SQUARE_BRACKET
MSG_OK = OP_SQUARE_BRACKET + colored(" OK ", "green", attrs=['bold']) + CL_SQUARE_BRACKET
MSG_SKIPPED = OP_SQUARE_BRACKET + colored(" SKIPPED ", "cyan", attrs=['bold']) + CL_SQUARE_BRACKET
global SERVER_DIED
global exit_code
global server_logs_level
def is_data_present():
clickhouse_proc = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
@ -112,7 +252,6 @@ def main(args):
return stdout.startswith('1')
base_dir = os.path.abspath(args.queries)
tmp_dir = os.path.abspath(args.tmp)
@ -127,7 +266,6 @@ def main(args):
# Force to print server warnings in stderr
# Shell scripts could change logging level
server_logs_level = "warning"
os.environ.setdefault("CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL", server_logs_level)
if args.zookeeper is None:
@ -147,10 +285,6 @@ def main(args):
else:
args.shard = False
passed_total = 0
skipped_total = 0
failures_total = 0
clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
clickhouse_proc_create.communicate("CREATE DATABASE IF NOT EXISTS " + args.database)
if args.database != "test":
@ -192,9 +326,7 @@ def main(args):
suite = suite_re_obj.group(1)
if os.path.isdir(suite_dir):
failures = 0
failures_chain = 0
if 'stateful' in suite and not is_data_present():
if 'stateful' in suite and not args.no_stateful and not is_data_present():
print("Won't run stateful tests because test data wasn't loaded.")
continue
if 'stateless' in suite and args.no_stateless:
@ -222,150 +354,48 @@ def main(args):
except ValueError:
return 99997
run_n, run_total = args.parallel.split('/')
run_n = float(run_n)
run_total = float(run_total)
all_tests = os.listdir(suite_dir)
all_tests = filter(lambda case: is_test_from_dir(suite_dir, case), all_tests)
all_tests = sorted(filter(lambda case: re.search(args.test, case) if args.test else True, all_tests), key=key_func)
run_n, run_total = args.parallel.split('/')
run_n = float(run_n)
run_total = float(run_total)
tests_n = len(all_tests)
start = int(tests_n / run_total * (run_n - 1))
if start > 0:
start = start + 1
end = int(tests_n / run_total * (run_n))
all_tests = all_tests[start : end]
if run_total > tests_n:
run_total = tests_n
if run_n > run_total:
continue
print("\nRunning {} {} tests.".format(tests_n, suite) + (" {} .. {} ".format(start, end) if run_total > 1 else "") + "\n")
jobs = args.jobs
if jobs > run_total:
run_total = jobs
for case in all_tests:
if SERVER_DIED:
break
all_tests_array = []
for n in range(1, 1 + int(run_total)):
start = int(tests_n / run_total * (n - 1))
end = int(tests_n / run_total * n)
all_tests_array.append([all_tests[start : end], suite, suite_dir, suite_tmp_dir, run_total])
case_file = os.path.join(suite_dir, case)
(name, ext) = os.path.splitext(case)
try:
sys.stdout.write("{0:72}".format(name + ": "))
if run_total == 1:
sys.stdout.flush()
if args.skip and any(s in name for s in args.skip):
print(MSG_SKIPPED + " - skip")
skipped_total += 1
elif not args.zookeeper and 'zookeeper' in name:
print(MSG_SKIPPED + " - no zookeeper")
skipped_total += 1
elif not args.shard and 'shard' in name:
print(MSG_SKIPPED + " - no shard")
skipped_total += 1
elif not args.no_long and 'long' in name:
print(MSG_SKIPPED + " - no long")
skipped_total += 1
else:
disabled_file = os.path.join(suite_dir, name) + '.disabled'
if os.path.exists(disabled_file) and not args.disabled:
message = open(disabled_file, 'r').read()
print(MSG_SKIPPED + " - " + message)
else:
if args.testname:
clickhouse_proc = Popen(shlex.split(args.client_with_database), stdin=PIPE, stdout=PIPE, stderr=PIPE)
clickhouse_proc.communicate("SELECT 'Running test {suite}/{case} from pid={pid}';".format(pid = os.getpid(), case = case, suite = suite))
reference_file = os.path.join(suite_dir, name) + '.reference'
stdout_file = os.path.join(suite_tmp_dir, name) + '.stdout'
stderr_file = os.path.join(suite_tmp_dir, name) + '.stderr'
proc, stdout, stderr = run_single_test(args, ext, server_logs_level, case_file, stdout_file, stderr_file)
if proc.returncode is None:
try:
proc.kill()
except OSError as e:
if e.errno != ESRCH:
raise
failures += 1
print("{0} - Timeout!".format(MSG_FAIL))
else:
counter = 1
while proc.returncode != 0 and need_retry(stderr):
proc, stdout, stderr = run_single_test(args, ext, server_logs_level, case_file, stdout_file, stderr_file)
sleep(2**counter)
counter += 1
if counter > 6:
break
if proc.returncode != 0:
failures += 1
failures_chain += 1
print("{0} - return code {1}".format(MSG_FAIL, proc.returncode))
if stderr:
print(stderr.encode('utf-8'))
if args.stop and ('Connection refused' in stderr or 'Attempt to read after eof' in stderr) and not 'Received exception from server' in stderr:
SERVER_DIED = True
elif stderr:
failures += 1
failures_chain += 1
print("{0} - having stderror:\n{1}".format(MSG_FAIL, stderr.encode('utf-8')))
elif 'Exception' in stdout:
failures += 1
failures_chain += 1
print("{0} - having exception:\n{1}".format(MSG_FAIL, stdout.encode('utf-8')))
elif not os.path.isfile(reference_file):
print("{0} - no reference file".format(MSG_UNKNOWN))
else:
result_is_different = subprocess.call(['diff', '-q', reference_file, stdout_file], stdout = PIPE)
if result_is_different:
diff = Popen(['diff', '--unified', reference_file, stdout_file], stdout = PIPE).communicate()[0]
failures += 1
print("{0} - result differs with reference:\n{1}".format(MSG_FAIL, diff))
else:
passed_total += 1
failures_chain = 0
print(MSG_OK)
if os.path.exists(stdout_file):
os.remove(stdout_file)
if os.path.exists(stderr_file):
os.remove(stderr_file)
except KeyboardInterrupt as e:
print(colored("Break tests execution", "red"))
raise e
except:
import traceback
exc_type, exc_value, tb = sys.exc_info()
failures += 1
print("{0} - Test internal error: {1}\n{2}\n{3}".format(MSG_FAIL, exc_type.__name__, exc_value, "\n".join(traceback.format_tb(tb, 10))))
if failures_chain >= 20:
break
failures_total = failures_total + failures
exit_code = 0
if failures_total > 0:
print(colored("\nHaving {failures_total} errors! {passed_total} tests passed. {skipped_total} tests skipped.".format(passed_total = passed_total, skipped_total = skipped_total, failures_total = failures_total), "red", attrs=["bold"]))
exit_code = 1
else:
print(colored("\n{passed_total} tests passed. {skipped_total} tests skipped.".format(passed_total = passed_total, skipped_total = skipped_total), "green", attrs=["bold"]))
if jobs > 1:
with closing(Pool(processes=jobs)) as pool:
pool.map(run_tests_array, all_tests_array)
pool.terminate()
else:
run_tests_array(all_tests_array[int(run_n)-1])
if args.hung_check:
processlist = get_processlist(args.client_with_database)
if processlist:
server_pid = get_server_pid(os.getenv("CLICKHOUSE_PORT_TCP", '9000'))
print(colored("\nFound hung queries in processlist:", "red", attrs=["bold"]))
print(colored("\nFound hung queries in processlist:", args, "red", attrs=["bold"]))
print(processlist)
if server_pid:
print("\nStacktraces of all threads:")
print(get_stacktraces(server_pid))
exit_code = 1
else:
print(colored("\nNo queries hung.", "green", attrs=["bold"]))
print(colored("\nNo queries hung.", args, "green", attrs=["bold"]))
sys.exit(exit_code)
@ -400,7 +430,8 @@ if __name__ == '__main__':
parser.add_argument('--hung-check', action='store_true', default=False)
parser.add_argument('--force-color', action='store_true', default=False)
parser.add_argument('--database', default='test', help='Default database for tests')
parser.add_argument('--parallel', default='1/1', help='Parralel test run number/total')
parser.add_argument('--parallel', default='1/1', help='One parallel test run number/total')
parser.add_argument('-j', '--jobs', default=1, help='Run all tests in parallel', type=int)
parser.add_argument('--no-stateless', action='store_true', help='Disable all stateless tests')
parser.add_argument('--no-stateful', action='store_true', help='Disable all stateful tests')

View File

@ -18,3 +18,23 @@
1000
1000
1000
10
10
10
10
10
10
10
10
10
10
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000

View File

@ -6,5 +6,7 @@ INSERT INTO test.group_uniq_str SELECT 5 as id, toString(number % 100) as v FROM
SELECT length(groupUniqArray(v)) FROM test.group_uniq_str GROUP BY id ORDER BY id;
SELECT length(groupUniqArray(v)) FROM remote('127.0.0.{2,3,4,5}', 'test', 'group_uniq_str') GROUP BY id ORDER BY id;
SELECT length(groupUniqArray(10)(v)) FROM test.group_uniq_str GROUP BY id ORDER BY id;
SELECT length(groupUniqArray(10000)(v)) FROM test.group_uniq_str GROUP BY id ORDER BY id;
DROP TABLE IF EXISTS test.group_uniq_str;

View File

@ -18,3 +18,23 @@
20001
20001
20001
10
10
10
10
10
10
10
10
10
10
20001
20001
20001
20001
20001
20001
20001
20001
20001
20001

View File

@ -5,5 +5,8 @@ CREATE TABLE test.group_uniq_arr_int ENGINE = Memory AS
SELECT length(groupUniqArray(v)) FROM test.group_uniq_arr_int GROUP BY id ORDER BY id;
SELECT length(groupUniqArray(v)) FROM remote('127.0.0.{2,3,4,5}', 'test', 'group_uniq_arr_int') GROUP BY id ORDER BY id;
SELECT length(groupUniqArray(10)(v)) FROM test.group_uniq_arr_int GROUP BY id ORDER BY id;
SELECT length(groupUniqArray(100000)(v)) FROM test.group_uniq_arr_int GROUP BY id ORDER BY id;
DROP TABLE IF EXISTS test.group_uniq_arr_int;

View File

@ -1 +1,5 @@
['2016-12-16'] ['2016-12-16 12:00:00']
2 2 3 3
1 1 1 1
3 3
1 1

View File

@ -1,5 +1,8 @@
DROP TABLE IF EXISTS grop_uniq_array_date;
CREATE TABLE grop_uniq_array_date (d Date, dt DateTime) ENGINE = Memory;
INSERT INTO grop_uniq_array_date VALUES (toDate('2016-12-16'), toDateTime('2016-12-16 12:00:00')) (toDate('2016-12-16'), toDateTime('2016-12-16 12:00:00'));
CREATE TABLE grop_uniq_array_date (d Date, dt DateTime, id Integer) ENGINE = Memory;
INSERT INTO grop_uniq_array_date VALUES (toDate('2016-12-16'), toDateTime('2016-12-16 12:00:00'), 1) (toDate('2016-12-16'), toDateTime('2016-12-16 12:00:00'), 1);
SELECT groupUniqArray(d), groupUniqArray(dt) FROM grop_uniq_array_date;
INSERT INTO grop_uniq_array_date VALUES (toDate('2016-12-17'), toDateTime('2016-12-17 12:00:00'), 1), (toDate('2016-12-18'), toDateTime('2016-12-18 12:00:00'), 1), (toDate('2016-12-16'), toDateTime('2016-12-16 12:00:00'), 2);
SELECT length(groupUniqArray(2)(d)), length(groupUniqArray(2)(dt)), length(groupUniqArray(d)), length(groupUniqArray(dt)) FROM grop_uniq_array_date GROUP BY id ORDER BY id;
SELECT length(groupUniqArray(10000)(d)), length(groupUniqArray(10000)(dt)) FROM grop_uniq_array_date GROUP BY id ORDER BY id;
DROP TABLE IF EXISTS grop_uniq_array_date;

View File

@ -0,0 +1,21 @@
2019-04-18 42 Line1
2019-04-18 42 Line2
2019-04-18 42 Line3
2019-04-18 42 Line4
2019-04-18 42 Line5
2019-04-18 42 Line6
2019-04-18 42 Line7
2019-04-18 42 Line8
2019-04-18 42 Line9
2019-04-18 1 Line10
2019-04-18 2 Line11
2019-04-18 1 Line12
2019-04-18 2 Line13
2019-04-18 1
2019-04-18 2
0000-00-00 1 Line16
0000-00-00 2 Line17
2019-04-18 0 Line18
2019-04-18 0 Line19
0000-00-00 0
0000-00-00 0

View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.csv"
$CLICKHOUSE_CLIENT --query="CREATE TABLE test.csv (d Date, u UInt8, str String) ENGINE = TinyLog"
INSERT_QUERY='$CLICKHOUSE_CLIENT --query="INSERT INTO test.csv FORMAT CSVWithNames"'
USE_HEADER='--input_format_with_names_use_header=1'
SKIP_UNKNOWN='--input_format_skip_unknown_fields=1'
# Simple check for parsing
echo -ne 'd,u,str\n2019-04-18,42,Line1\n2019-04-18,42,Line2' | eval $INSERT_QUERY
echo -ne 'd,u,str\n2019-04-18,42,Line3\n2019-04-18,42,Line4' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'd,u,str\n2019-04-18,42,Line5\n2019-04-18,42,Line6' | eval $INSERT_QUERY $USE_HEADER $SKIP_UNKNOWN
# Random order of fields
echo -ne 'u,d,str\n42,2019-04-18,Line7\n' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'u,str,d\n42,Line8,2019-04-18\n' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'str,u,d\nLine9,42,2019-04-18\n' | eval $INSERT_QUERY $USE_HEADER
# Excessive fields
echo -ne 'd,u,str,more,unknown,fields\n2019-04-18,1,Line10,,,\n2019-04-18,2,Line11,,,\n' \
| eval $INSERT_QUERY $USE_HEADER $SKIP_UNKNOWN
echo -ne 'd,unknown,str,more,u,fields\n2019-04-18,blahblah,Line12,,1,\n2019-04-18,,Line13,blahblah,2,\n' \
| eval $INSERT_QUERY $USE_HEADER $SKIP_UNKNOWN
# Missing fields (defaults)
echo -ne 'd,u\n2019-04-18,1\n2019-04-18,2\n' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'str,u\nLine16,1\nLine17,2\n' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'd,str\n2019-04-18,Line18\n2019-04-18,Line19\n'| eval $INSERT_QUERY $USE_HEADER
echo -ne 'unknown\n\n\n' | eval $INSERT_QUERY $USE_HEADER $SKIP_UNKNOWN
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.csv"
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.csv"

View File

@ -0,0 +1,21 @@
2019-04-18 42 Line1
2019-04-18 42 Line2
2019-04-18 42 Line3
2019-04-18 42 Line4
2019-04-18 42 Line5
2019-04-18 42 Line6
2019-04-18 42 Line7
2019-04-18 42 Line8
2019-04-18 42 Line9
2019-04-18 1 Line10
2019-04-18 2 Line11
2019-04-18 1 Line12
2019-04-18 2 Line13
2019-04-18 1
2019-04-18 2
0000-00-00 1 Line16
0000-00-00 2 Line17
2019-04-18 0 Line18
2019-04-18 0 Line19
0000-00-00 0
0000-00-00 0

View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.tsv"
$CLICKHOUSE_CLIENT --query="CREATE TABLE test.tsv (d Date, u UInt8, str String) ENGINE = TinyLog"
INSERT_QUERY='$CLICKHOUSE_CLIENT --query="INSERT INTO test.tsv FORMAT TSVWithNames"'
USE_HEADER='--input_format_with_names_use_header=1'
SKIP_UNKNOWN='--input_format_skip_unknown_fields=1'
# Simple check for parsing
echo -ne 'd\tu\tstr\n2019-04-18\t42\tLine1\n2019-04-18\t42\tLine2' | eval $INSERT_QUERY
echo -ne 'd\tu\tstr\n2019-04-18\t42\tLine3\n2019-04-18\t42\tLine4' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'd\tu\tstr\n2019-04-18\t42\tLine5\n2019-04-18\t42\tLine6' | eval $INSERT_QUERY $USE_HEADER $SKIP_UNKNOWN
# Random order of fields
echo -ne 'u\td\tstr\n42\t2019-04-18\tLine7\n' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'u\tstr\td\n42\tLine8\t2019-04-18\n' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'str\tu\td\nLine9\t42\t2019-04-18\n' | eval $INSERT_QUERY $USE_HEADER
# Excessive fields
echo -ne 'd\tu\tstr\tmore\tunknown\tfields\n2019-04-18\t1\tLine10\t\t\t\n2019-04-18\t2\tLine11\t\t\t\n' \
| eval $INSERT_QUERY $USE_HEADER $SKIP_UNKNOWN
echo -ne 'd\tunknown\tstr\tmore\tu\tfields\n2019-04-18\tblahblah\tLine12\t\t1\t\n2019-04-18\t\tLine13\tblahblah\t2\t\n' \
| eval $INSERT_QUERY $USE_HEADER $SKIP_UNKNOWN
# Missing fields (defaults)
echo -ne 'd\tu\n2019-04-18\t1\n2019-04-18\t2\n' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'str\tu\nLine16\t1\nLine17\t2\n' | eval $INSERT_QUERY $USE_HEADER
echo -ne 'd\tstr\n2019-04-18\tLine18\n2019-04-18\tLine19\n'| eval $INSERT_QUERY $USE_HEADER
echo -ne 'unknown\n\n\n' | eval $INSERT_QUERY $USE_HEADER $SKIP_UNKNOWN
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.tsv"
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.tsv"

View File

@ -0,0 +1,4 @@
FROM centos:5
CMD /bin/sh -c "/clickhouse server --config /config/config.xml > /var/log/clickhouse-server/stderr.log 2>&1 & \
sleep 5 && /clickhouse client --query \"select 'OK'\" 2> /var/log/clickhouse-server/clientstderr.log || echo 'FAIL'"

View File

@ -0,0 +1,4 @@
FROM ubuntu:12.04
CMD /bin/sh -c "/clickhouse server --config /config/config.xml > /var/log/clickhouse-server/stderr.log 2>&1 & \
sleep 5 && /clickhouse client --query \"select 'OK'\" 2> /var/log/clickhouse-server/clientstderr.log || echo 'FAIL'"

View File

@ -18,7 +18,9 @@ RUN apt-get update -y \
openssl \
netcat-openbsd \
telnet \
llvm-8
llvm-8 \
brotli
COPY ./stress /stress
COPY log_queries.xml /etc/clickhouse-server/users.d/log_queries.xml

View File

@ -11,7 +11,7 @@ import time
def run_perf_test(cmd, xmls_path, output_folder):
output_path = os.path.join(output_folder, "perf_stress_run.txt")
f = open(output_path, 'w')
p = Popen("{} --skip-tags=long --r {}".format(cmd, xmls_path), shell=True, stdout=f, stderr=f)
p = Popen("{} --skip-tags=long --recursive --input-files {}".format(cmd, xmls_path), shell=True, stdout=f, stderr=f)
return p
def run_func_test(cmd, output_prefix, num_processes):
@ -55,7 +55,7 @@ if __name__ == "__main__":
while True:
retcodes = []
for p in func_pipes:
if p.poll():
if p.poll() is not None:
retcodes.append(p.returncode)
if len(retcodes) == len(func_pipes):
break

View File

@ -86,7 +86,7 @@ SELECT key, sum(value) FROM summtt GROUP BY key
```
## Data Processing
## Data Processing {#data-processing}
When data is inserted into a table, it is saved as-is. ClickHouse merges the inserted data parts periodically, and this is when rows with the same primary key are summed and replaced with a single row for each resulting data part.
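A minimal sketch of this behaviour, reusing the `summtt` table from the example above (the `OPTIMIZE ... FINAL` step only forces the merge for illustration and is expensive on real tables):
```sql
-- Before a background merge happens, a plain SELECT may still return the individual inserted rows.
SELECT key, value FROM summtt;

-- Forcing an unscheduled merge makes the per-key summation visible without an aggregate function.
OPTIMIZE TABLE summtt FINAL;
SELECT key, value FROM summtt;
```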

View File

@ -369,10 +369,13 @@ Optional parameters:
- The default value for substituting in empty positions.
- The length of the resulting array. This allows you to receive arrays of the same size for all the aggregate keys. When using this parameter, the default value must be specified.
## groupUniqArray(x)
## groupUniqArray(x), groupUniqArray(max_size)(x)
Creates an array from different argument values. Memory consumption is the same as for the `uniqExact` function.
The second version (with the `max_size` parameter) limits the size of the resulting array to `max_size` elements.
For example, `groupUniqArray(1)(x)` is equivalent to `[any(x)]`.
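A short illustrative query; the `visits` table and its `user_id` and `browser` columns are assumptions made up for this sketch:
```sql
SELECT
    user_id,
    groupUniqArray(browser)    AS all_unique_browsers,
    groupUniqArray(3)(browser) AS at_most_three_browsers
FROM visits
GROUP BY user_id
```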
## quantile(level)(x)
Approximates the `level` quantile. `level` is a constant, a floating-point number from 0 to 1.

View File

@ -1,9 +1,9 @@
# Dictionary
The `Dictionary` engine displays the dictionary data as a ClickHouse table.
`Dictionary` 引擎将字典数据展示为一个ClickHouse的表。
As an example, consider a dictionary of `products` with the following configuration:
例如,考虑使用一个具有以下配置的 `products` 字典:
```xml
<dictionaries>
@ -36,7 +36,7 @@ As an example, consider a dictionary of `products` with the following configurat
</dictionaries>
```
Query the dictionary data:
查询字典中的数据:
``` sql
select name, type, key, attribute.names, attribute.types, bytes_allocated, element_count,source from system.dictionaries where name = 'products';
@ -60,17 +60,17 @@ WHERE name = 'products'
└──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘
```
You can use the [dictGet*](../../query_language/functions/ext_dict_functions.md#ext_dict_functions) function to get the dictionary data in this format.
你可以使用 [dictGet*](../../query_language/functions/ext_dict_functions.md#ext_dict_functions) 函数来获取这种格式的字典数据。
This view isn't helpful when you need to get raw data, or when performing a `JOIN` operation. For these cases, you can use the `Dictionary` engine, which displays the dictionary data in a table.
当你需要获取原始数据,或者是想要使用 `JOIN` 操作的时候,这种视图并没有什么帮助。对于这些情况,你可以使用 `Dictionary` 引擎,它可以将字典数据展示在表中。
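A rough sketch of that `JOIN` use case: the `sales` table and its `amount` column are hypothetical, while the `products` table with `product_id` and `title` is the one created in the usage example later on this page:
```sql
SELECT product_id, title, sum(amount) AS total_amount
FROM sales
ANY LEFT JOIN products USING (product_id)
GROUP BY product_id, title
```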
Syntax:
语法:
```
CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)
```
Usage example:
示例:
``` sql
create table products (product_id UInt64, title String) Engine = Dictionary(products);
@ -89,7 +89,7 @@ Ok.
0 rows in set. Elapsed: 0.004 sec.
```
Take a look at what's in the table.
看一看表中的内容。
``` sql
select * from products limit 1;
@ -108,4 +108,4 @@ LIMIT 1
```
[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/dictionary/) <!--hide-->
[来源文章](https://clickhouse.yandex/docs/en/operations/table_engines/dictionary/) <!--hide-->

View File

@ -87,7 +87,7 @@ ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDa
<details markdown="1"><summary>已弃用的建表方法</summary>
!!! 注意
!!! attention "注意"
不要在新版项目中使用该方法,可能的话,请将旧项目切换到上述方法。
```

View File

@ -1,12 +1,12 @@
# ReplacingMergeTree
The engine differs from [MergeTree](mergetree.md) in that it removes duplicate entries with the same primary key value.
该引擎和[MergeTree](mergetree.md)的不同之处在于它会删除具有相同主键的重复项。
Data deduplication occurs only during a merge. Merging occurs in the background at an unknown time, so you can't plan for it. Some of the data may remain unprocessed. Although you can run an unscheduled merge using the `OPTIMIZE` query, don't count on using it, because the `OPTIMIZE` query will read and write a large amount of data.
数据的去重只会在合并的过程中出现。合并会在未知的时间在后台进行,因此你无法预先作出计划。有一些数据可能仍未被处理。尽管你可以调用 `OPTIMIZE` 语句发起计划外的合并,但请不要指望使用它,因为 `OPTIMIZE` 语句会引发对大量数据的读和写。
Thus, `ReplacingMergeTree` is suitable for clearing out duplicate data in the background in order to save space, but it doesn't guarantee the absence of duplicates.
因此,`ReplacingMergeTree` 适用于在后台清除重复的数据以节省空间,但是它不保证没有重复的数据出现。
## Creating a Table
## 建表
```sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
@ -21,24 +21,24 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
[SETTINGS name=value, ...]
```
For a description of request parameters, see [request description](../../query_language/create.md).
请求参数的描述,参考[请求参数](../../query_language/create.md)。
**ReplacingMergeTree Parameters**
- `ver` - column with version. Type `UInt*`, `Date` or `DateTime`. Optional parameter.
- `ver` - 版本列。类型为 `UInt*`, `Date` 或 `DateTime`。可选参数。
When merging, `ReplacingMergeTree` keeps only one row out of all the rows with the same primary key:
- The last one in the selection, if `ver` is not set.
- The one with the maximum version, if `ver` is specified.
合并的时候,`ReplacingMergeTree` 从所有具有相同主键的行中选择一行留下:
- 如果 `ver` 列未指定,选择最后一条。
- 如果 `ver` 列已指定,选择 `ver` 值最大的版本。
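An illustrative sketch of these rules; the table and column names below are assumptions, not objects from this page:
```sql
CREATE TABLE page_state
(
    page_id UInt64,
    version UInt32,
    payload String
)
ENGINE = ReplacingMergeTree(version)
ORDER BY page_id;

INSERT INTO page_state VALUES (1, 1, 'old'), (1, 2, 'new');

-- After a merge (forced here with OPTIMIZE ... FINAL only for illustration),
-- a single row remains for page_id = 1: the one with the maximum `version`.
OPTIMIZE TABLE page_state FINAL;
SELECT * FROM page_state;
```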
**Query clauses**
**子句**
When creating a `ReplacingMergeTree` table the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table.
创建 `ReplacingMergeTree` 表时,需要与创建 `MergeTree` 表时相同的[子句](mergetree.md)。
<details markdown="1"><summary>Deprecated Method for Creating a Table</summary>
<details markdown="1"><summary>已弃用的建表方法</summary>
!!! attention
Do not use this method in new projects and, if possible, switch the old projects to the method described above.
!!! attention "注意"
不要在新项目中使用该方法,可能的话,请将旧项目切换到上述方法。
```sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
@ -49,10 +49,10 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
) ENGINE [=] ReplacingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [ver])
```
All of the parameters excepting `ver` have the same meaning as in `MergeTree`.
除了 `ver` 的所有参数都与 `MergeTree` 中的含义相同。
- `ver` - column with the version. Optional parameter. For a description, see the text above.
- `ver` - 版本列。可选参数,有关说明,请参阅上文。
</details>
[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/replacingmergetree/) <!--hide-->
[来源文章](https://clickhouse.yandex/docs/en/operations/table_engines/replacingmergetree/) <!--hide-->

View File

@ -1,11 +1,11 @@
# SummingMergeTree
The engine inherits from [MergeTree](mergetree.md). The difference is that when merging data parts for `SummingMergeTree` tables, ClickHouse replaces all the rows with the same primary key with one row that contains the summarized values for the columns with a numeric data type. If the primary key is composed in a way that a single key value corresponds to a large number of rows, this significantly reduces storage volume and speeds up data selection.
该引擎继承自 [MergeTree](mergetree.md)。区别在于,当合并 `SummingMergeTree` 表的数据片段时,ClickHouse 会把所有具有相同主键的行合并为一行,该行包含了被合并的行中具有数值数据类型的列的汇总值。如果主键的组合方式使得单个键值对应于大量的行,则可以显著的减少存储空间并加快数据查询的速度。
We recommend using the engine together with `MergeTree`. Store the complete data in a `MergeTree` table, and use `SummingMergeTree` to store aggregated data, for example, when preparing reports. Such an approach will prevent you from losing valuable data due to an incorrectly composed primary key.
我们推荐将该引擎和 `MergeTree` 一起使用。例如,在准备做报告的时候,将完整的数据存储在 `MergeTree` 表中,并且使用 `SummingMergeTree` 来存储聚合数据。这种方法可以使你避免因为使用不正确的主键组合方式而丢失有价值的数据。
## Creating a Table
## 建表
```
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
@ -20,23 +20,23 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
[SETTINGS name=value, ...]
```
For a description of request parameters, see [request description](../../query_language/create.md).
请求参数的描述,参考 [请求描述](../../query_language/create.md)。
**Parameters of SummingMergeTree**
**SummingMergeTree 的参数**
- `columns` - a tuple with the names of the columns whose values will be summarized. Optional parameter.
The columns must be of a numeric type and must not be in the primary key.
- `columns` - 包含了将要被汇总的列的列名的元组。可选参数。
所选的列必须是数值类型,并且不可位于主键中。
If `columns` is not specified, ClickHouse summarizes the values in all columns with a numeric data type that are not in the primary key.
如果没有指定 `columns`,ClickHouse 会把所有不在主键中的数值类型的列都进行汇总。
**Query clauses**
**子句**
When creating a `SummingMergeTree` table the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table.
创建 `SummingMergeTree` 表时,需要与创建 `MergeTree` 表时相同的[子句](mergetree.md)。
<details markdown="1"><summary>Deprecated Method for Creating a Table</summary>
<details markdown="1"><summary>已弃用的建表方法</summary>
!!! attention
Do not use this method in new projects and, if possible, switch the old projects to the method described above.
!!! attention "注意"
不要在新项目中使用该方法,可能的话,请将旧项目切换到上述方法。
```
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
@ -47,14 +47,14 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
) ENGINE [=] SummingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [columns])
```
All of the parameters excepting `columns` have the same meaning as in `MergeTree`.
`columns` 外的所有参数都与 `MergeTree` 中的含义相同。
- `columns` - a tuple with the names of the columns whose values will be summarized. Optional parameter. For a description, see the text above.
- `columns` - 包含将要被汇总的列的列名的元组。可选参数。有关说明,请参阅上文。
</details>
## Usage Example
## 用法示例
Consider the following table:
考虑如下的表:
```sql
CREATE TABLE summtt
@ -66,13 +66,13 @@ ENGINE = SummingMergeTree()
ORDER BY key
```
Insert data to it:
向其中插入数据:
```
:) INSERT INTO summtt Values(1,1),(1,2),(2,1)
```
ClickHouse may not sum all the rows completely ([see below](#data-processing)), so we use the aggregate function `sum` and a `GROUP BY` clause in the query.
ClickHouse 可能不会完整的汇总所有行([见下文](#data-processing)),因此我们在查询中使用了聚合函数 `sum` 和 `GROUP BY` 子句。
```sql
SELECT key, sum(value) FROM summtt GROUP BY key
@ -86,38 +86,38 @@ SELECT key, sum(value) FROM summtt GROUP BY key
```
## Data Processing
## 数据处理 {#data-processing}
When data is inserted into a table, it is saved as-is. ClickHouse merges the inserted data parts periodically, and this is when rows with the same primary key are summed and replaced with a single row for each resulting data part.
当数据被插入到表中时,他们将被原样保存。ClickHouse 定期合并插入的数据片段,并在这个时候对所有具有相同主键的行中的列进行汇总,将这些行替换为包含汇总数据的一行记录。
ClickHouse can merge the data parts in such a way that different resulting data parts consist of rows with the same primary key, i.e. the summation will be incomplete. Therefore, the aggregate function [sum()](../../query_language/agg_functions/reference.md#agg_function-sum) and a `GROUP BY` clause should be used in a (`SELECT`) query, as described in the example above.
ClickHouse 会按片段合并数据,以至于不同的数据片段中会包含具有相同主键的行,即单个汇总片段将会是不完整的。因此,聚合函数 [sum()](../../query_language/agg_functions/reference.md#agg_function-sum) 和 `GROUP BY` 子句应该在(`SELECT`)查询语句中被使用,如上文中的例子所述。
### Common rules for summation
### 汇总的通用规则
The values in the columns with the numeric data type are summarized. The set of columns is defined by the parameter `columns`.
列中数值类型的值会被汇总。这些列的集合在参数 `columns` 中被定义。
If the values were 0 in all of the columns for summation, the row is deleted.
如果用于汇总的所有列中的值均为0,则该行会被删除。
If a column is not in the primary key and is not summarized, an arbitrary value is selected from the existing ones.
如果列不在主键中且无法被汇总,则会在现有的值中任选一个。
The values are not summarized for columns in the primary key.
主键所在的列中的值不会被汇总。
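A compact sketch of these rules; the table and column names are made up for this example:
```sql
CREATE TABLE summing_rules_demo
(
    key  UInt32,   -- primary key column: never summed
    hits Int32,    -- numeric and not in the key: summed
    note String    -- not in the key and not summable: an arbitrary existing value is kept
)
ENGINE = SummingMergeTree()
ORDER BY key;

INSERT INTO summing_rules_demo VALUES (1, 5, 'a'), (1, -5, 'b'), (2, 3, 'c');

-- Forcing the merge for illustration: the rows for key = 1 sum to 0 and are deleted,
-- while a single summed row remains for key = 2.
OPTIMIZE TABLE summing_rules_demo FINAL;
SELECT * FROM summing_rules_demo;
```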
### The Summation in the AggregateFunction Columns
### AggregateFunction 列中的汇总
For columns of [AggregateFunction type](../../data_types/nested_data_structures/aggregatefunction.md) ClickHouse behaves as [AggregatingMergeTree](aggregatingmergetree.md) engine aggregating according to the function.
对于 [AggregateFunction 类型](../../data_types/nested_data_structures/aggregatefunction.md)的列,ClickHouse 根据对应函数表现为 [AggregatingMergeTree](aggregatingmergetree.md) 引擎的聚合。
### Nested Structures
### 嵌套结构
A table can have nested data structures that are processed in a special way.
表中可以具有以特殊方式处理的嵌套数据结构。
If the name of a nested table ends with `Map` and it contains at least two columns that meet the following criteria:
如果嵌套表的名称以 `Map` 结尾,并且包含至少两个符合以下条件的列:
- the first column is numeric `(*Int*, Date, DateTime)`, let's call it `key`,
- the other columns are arithmetic `(*Int*, Float32/64)`, let's call them `(values...)`,
- 第一列是数值类型 `(*Int*, Date, DateTime)`,我们称之为 `key`,
- 其他的列是可计算的 `(*Int*, Float32/64)`,我们称之为 `(values...)`,
then this nested table is interpreted as a mapping of `key => (values...)`, and when merging its rows, the elements of two data sets are merged by `key` with a summation of the corresponding `(values...)`.
然后这个嵌套表会被解释为一个 `key => (values...)` 的映射,当合并它们的行时,两个数据集中的元素会被根据 `key` 合并为相应的 `(values...)` 的汇总值。
Examples:
示例:
```
[(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)]
@ -126,8 +126,8 @@ Examples:
[(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)]
```
When requesting data, use the [sumMap(key, value)](../../query_language/agg_functions/reference.md) function for aggregation of `Map`.
请求数据时,使用 [sumMap(key, value)](../../query_language/agg_functions/reference.md) 函数来对 `Map` 进行聚合。
For a nested data structure, you do not need to specify its columns in the tuple of columns for summation.
对于嵌套数据结构,你无需在列的元组中指定列以进行汇总。
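A brief sketch of querying such a nested structure with `sumMap`; the table and column names are assumptions:
```sql
CREATE TABLE status_map_demo
(
    day Date,
    StatusMap Nested
    (
        code     UInt16,
        requests UInt64
    )
)
ENGINE = SummingMergeTree()
ORDER BY day;

-- Nested columns are inserted as parallel arrays: StatusMap.code and StatusMap.requests.
INSERT INTO status_map_demo VALUES ('2019-04-18', [200, 404], [10, 1]), ('2019-04-18', [200], [5]);

-- sumMap merges the maps by `code`, summing the corresponding `requests` values.
SELECT day, sumMap(StatusMap.code, StatusMap.requests) FROM status_map_demo GROUP BY day;
```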
[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/summingmergetree/) <!--hide-->
[来源文章](https://clickhouse.yandex/docs/en/operations/table_engines/summingmergetree/) <!--hide-->

View File

@ -1,7 +1,7 @@
set (CONFIG_COMMON ${CMAKE_CURRENT_BINARY_DIR}/include/common/config_common.h)
configure_file (${CMAKE_CURRENT_SOURCE_DIR}/include/common/config_common.h.in ${CONFIG_COMMON})
add_library (apple_rt
add_library(apple_rt
src/apple_rt.cpp
include/port/clock.h
)
@ -10,7 +10,7 @@ if (DEFINED APPLE_HAVE_CLOCK_GETTIME)
target_compile_definitions(apple_rt PUBLIC -DAPPLE_HAVE_CLOCK_GETTIME=${APPLE_HAVE_CLOCK_GETTIME})
endif ()
add_library (common ${LINK_MODE}
add_library(common
src/DateLUT.cpp
src/DateLUTImpl.cpp
src/preciseExp10.c
@ -91,9 +91,13 @@ target_include_directories (common BEFORE PRIVATE ${CCTZ_INCLUDE_DIR})
target_include_directories (common PUBLIC ${COMMON_INCLUDE_DIR})
if (NOT USE_INTERNAL_BOOST_LIBRARY)
target_include_directories (common BEFORE PUBLIC ${Boost_INCLUDE_DIRS})
target_include_directories (common SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS})
endif ()
if(NOT USE_INTERNAL_POCO_LIBRARY)
target_include_directories (common SYSTEM BEFORE PUBLIC ${Poco_Foundation_INCLUDE_DIR})
endif()
target_link_libraries (common
PUBLIC
${Poco_Foundation_LIBRARY}

View File

@ -1,4 +1,4 @@
add_library (daemon ${LINK_MODE}
add_library (daemon
src/BaseDaemon.cpp
src/GraphiteWriter.cpp
src/ExtendedLogChannel.cpp
@ -22,4 +22,4 @@ endif ()
target_include_directories (daemon PUBLIC include)
target_link_libraries (daemon PRIVATE clickhouse_common_io clickhouse_common_config common ${Poco_Net_LIBRARY} ${Poco_Util_LIBRARY} ${EXECINFO_LIBRARY} ${ELF_LIBRARY})
target_link_libraries (daemon PRIVATE clickhouse_common_io clickhouse_common_config common ${Poco_Net_LIBRARY} ${Poco_Util_LIBRARY} ${EXECINFO_LIBRARIES})

View File

@ -1 +1 @@
add_library (memcpy memcpy.c)
add_library(memcpy memcpy.c)

View File

@ -1,4 +1,4 @@
add_library (mysqlxx ${LINK_MODE}
add_library (mysqlxx
src/Connection.cpp
src/Exception.cpp
src/Query.cpp

View File

@ -1,17 +1,17 @@
option (ENABLE_MYSQL "Enable MySQL" ${OS_LINUX})
if (OS_LINUX)
option (USE_INTERNAL_MYSQL_LIBRARY "Set to FALSE to use system mysqlclient library instead of bundled" ${NOT_UNBUNDLED})
else ()
option (USE_INTERNAL_MYSQL_LIBRARY "Set to FALSE to use system mysqlclient library instead of bundled" OFF)
endif ()
option(ENABLE_MYSQL "Enable MySQL" 1)
if(ENABLE_MYSQL)
if(OS_LINUX)
option(USE_INTERNAL_MYSQL_LIBRARY "Set to FALSE to use system mysqlclient library instead of bundled" ${NOT_UNBUNDLED})
else()
option(USE_INTERNAL_MYSQL_LIBRARY "Set to FALSE to use system mysqlclient library instead of bundled" OFF)
endif()
if (USE_INTERNAL_MYSQL_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/README.md")
message (WARNING "submodule contrib/mariadb-connector-c is missing. to fix try run: \n git submodule update --init --recursive")
set (USE_INTERNAL_MYSQL_LIBRARY 0)
endif ()
if(USE_INTERNAL_MYSQL_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/README.md")
message(WARNING "submodule contrib/mariadb-connector-c is missing. to fix try run: \n git submodule update --init --recursive")
set(USE_INTERNAL_MYSQL_LIBRARY 0)
endif()
if (ENABLE_MYSQL)
if (USE_INTERNAL_MYSQL_LIBRARY)
set (MYSQLCLIENT_LIBRARIES mysqlclient)
set (USE_MYSQL 1)

View File

@ -124,7 +124,7 @@ if __name__ == "__main__":
args.dataset_name, 'partitions', os.path.basename(file_path))
elif args.s3_path is not None:
s3_path = os.path.join(
args.dataset_name, s3_path, os.path.base_name(file_path))
args.dataset_name, args.s3_path, os.path.basename(file_path))
else:
raise Exception("Don't know s3-path to upload")

View File

@ -447,6 +447,8 @@ clickhouse-client
<li>Follow official <a
href="https://twitter.com/ClickHouseDB"
rel="external nofollow" target="_blank">Twitter account</a>.</li>
<li>Open <a href="https://github.com/yandex/ClickHouse/issues/new/choose"
rel="external nofollow" target="_blank">GitHub issue</a> if you have a bug report or feature request.</li>
<li>Or email Yandex ClickHouse team directly at
<a id="feedback_email" href="">turn on JavaScript to see email address</a>.
You can also <a href="https://forms.yandex.com/surveys/meet-yandex-clickhouse-team/" target="_blank" rel="external nofollow">fill this form</a> to meet us in person.</li>